datachain 0.8.10__py3-none-any.whl → 0.8.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/cache.py CHANGED
@@ -22,15 +22,15 @@ def try_scandir(path):
         pass
 
 
-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return DataChainCache(cache_dir, tmp_dir=tmp_dir)
+    return Cache(cache_dir, tmp_dir=tmp_dir)
 
 
 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["DataChainCache"]:
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()
 
 
-class DataChainCache:
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
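
Note: the rename above (DataChainCache to Cache) keeps the same behavior. A minimal usage sketch, assuming only the renamed API shown in these hunks; the scratch directory below is an illustrative assumption, not part of the diff:

    # Sketch only: uses the renamed Cache / temporary_cache API from the hunks above.
    from tempfile import mkdtemp
    from datachain.cache import temporary_cache

    scratch = mkdtemp()  # hypothetical scratch directory for the example
    with temporary_cache(scratch, prefix="session-") as cache:
        # `cache` is a Cache instance (formerly DataChainCache); its directory is
        # created under `scratch` and destroyed when the context exits.
        print(type(cache).__name__)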
@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog
 
@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]
@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time
@@ -26,11 +27,10 @@ from uuid import uuid4
 
 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm
 
-from datachain.cache import DataChainCache
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
@@ -38,7 +38,6 @@ from datachain.dataset import (
     DatasetDependency,
     DatasetListRecord,
     DatasetRecord,
-    DatasetStats,
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
@@ -57,7 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir, datachain_paths_join
+from datachain.utils import DataChainDir
 
 from .datasource import DataSource
 
@@ -73,7 +72,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"
 
 TTL_INT = 4 * 60 * 60
 
@@ -99,6 +97,47 @@ def noop(_: str):
     pass
 
 
+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
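
For illustration, a minimal sketch of driving the new shutdown_process() escalation against a long-running child process; the import path and timeout values are assumptions, not part of the diff:

    # Sketch only: exercises the SIGINT -> SIGTERM -> SIGKILL escalation above.
    import subprocess

    from datachain.catalog.catalog import shutdown_process  # assumed module path

    proc = subprocess.Popen(["python", "-c", "import time; time.sleep(3600)"])
    # Give the child time to exit after SIGINT, then after SIGTERM, then kill it.
    return_code = shutdown_process(proc, interrupt_timeout=5, terminate_timeout=5)
    print("child exited with", return_code)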
@@ -247,7 +286,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +310,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-    edatachain_only: bool = False,
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:
@@ -333,7 +327,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None
 
-    if edatachain_only:
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename
 
     if not os.path.isdir(output):
@@ -358,10 +352,6 @@ def prepare_output_for_cp(
             copy_to_filename = output
         else:
             raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename
 
 
@@ -465,8 +455,6 @@ def instantiate_node_groups(
             copy_to_filename,
             recursive,
             copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
             node_group.is_dataset,
         )
         if not virtual_only:
@@ -484,24 +472,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()
 
 
-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:
@@ -536,7 +506,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone
@@ -559,7 +529,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache = DataChainCache(datachain_dir.cache, datachain_dir.tmp)
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,
@@ -703,22 +673,8 @@ class Catalog:
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
         for src in sources:  # Opt: parallel
-            if src.endswith(DATASET_FILE_SUFFIX) and os.path.isfile(src):
-                # TODO: Also allow using EDataChain files from cloud locations?
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:
@@ -796,7 +752,6 @@ class Catalog:
                         listing.client,
                         dsrc,
                         source_path,
-                        is_edatachain=True,
                     )
                 )
             else:
@@ -1279,17 +1234,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)
 
-    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
-        """
-        Returns tuple with dataset stats: total number of rows and total dataset size.
-        """
-        dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version or dataset.latest_version)
-        return DatasetStats(
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-        )
-
     def remove_dataset(
         self,
         name: str,
@@ -1360,8 +1304,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1373,8 +1315,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1439,19 +1379,12 @@ class Catalog:
             except DatasetNotFoundError:
                 pass
 
-            stats_response = studio_client.dataset_stats(
-                remote_ds_name, remote_ds_version.version
-            )
-            if not stats_response.ok:
-                raise_remote_error(stats_response.message)
-            ds_stats = stats_response.data
-
             dataset_save_progress_bar = tqdm(
                 desc=f"Saving dataset {remote_ds_uri} locally: ",
                 unit=" rows",
                 unit_scale=True,
                 unit_divisor=1000,
-                total=ds_stats.num_objects,  # type: ignore [union-attr]
+                total=remote_ds_version.num_objects,  # type: ignore [union-attr]
                 leave=False,
             )
 
@@ -1541,8 +1474,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1551,9 +1482,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp or edatachain:
+        if not no_cp:
             self.cp(
                 sources,
                 output,
@@ -1561,9 +1491,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-                edatachain_only=no_cp,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:
@@ -1588,6 +1516,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
@@ -1601,13 +1531,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
 
+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-            if capture_output:
-                args = (proc.stdout, output_hook)
-                thread = Thread(target=_process_stream, args=args, daemon=True)
-                thread.start()
-                thread.join()  # wait for the reader thread
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)
 
+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
@@ -1626,17 +1591,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-        edatachain_file: Optional[str] = None,
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) -> list[dict[str, Any]]:
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1609,11 @@ class Catalog:
         )
 
         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force, edatachain_only, no_edatachain_file
-        )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
+            node_groups, output, force, no_cp
         )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-        if total_files == 0:
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return
 
         desc_max_len = max(len(output) + 16, 19)
         bar_format = (
@@ -1667,7 +1623,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )
 
-        if not edatachain_only:
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1635,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-            edatachain_only,
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data
 
     def du(
         self,
datachain/cli/__init__.py CHANGED
@@ -11,7 +11,6 @@ from datachain.telemetry import telemetry
 from .commands import (
     clear_cache,
     completion,
-    dataset_stats,
     du,
     edit_dataset,
     garbage_collect,
@@ -47,9 +46,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)
 
-    client_config = {
-        "anon": args.anon,
-    }
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )
 
     if args.debug_sql:
         # This also sets this environment variable for any subprocesses
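
In effect, the CLI now sends an empty client config unless --anon is given. A tiny standalone sketch of the new conditional (the helper name below is hypothetical, not from the diff):

    # Sketch only: mirrors the conditional client_config shown above.
    def build_client_config(anon: bool) -> dict:
        return {"anon": anon} if anon else {}

    assert build_client_config(False) == {}
    assert build_client_config(True) == {"anon": True}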
@@ -107,9 +110,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
         recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )
 
@@ -181,13 +181,6 @@ def handle_dataset_command(args, catalog):
             all=args.all,
             team=args.team,
         ),
-        "stats": lambda: dataset_stats(
-            catalog,
-            args.name,
-            args.version,
-            show_bytes=args.bytes,
-            si=args.si,
-        ),
     }
 
     handler = dataset_commands.get(args.datasets_cmd)
@@ -1,5 +1,4 @@
 from .datasets import (
-    dataset_stats,
     edit_dataset,
     list_datasets,
     list_datasets_local,
@@ -15,7 +14,6 @@ from .show import show
 __all__ = [
     "clear_cache",
     "completion",
-    "dataset_stats",
     "du",
     "edit_dataset",
     "garbage_collect",
@@ -3,8 +3,6 @@ from typing import TYPE_CHECKING, Optional
 
 from tabulate import tabulate
 
-from datachain import utils
-
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
@@ -109,20 +107,3 @@ def edit_dataset(
 
     if (all or studio) and token:
         edit_studio_dataset(team, name, new_name, description, labels)
-
-
-def dataset_stats(
-    catalog: "Catalog",
-    name: str,
-    version: int,
-    show_bytes=False,
-    si=False,
-):
-    stats = catalog.dataset_stats(name, version)
-
-    if stats:
-        print(f"Number of objects: {stats.num_objects}")
-        if show_bytes:
-            print(f"Total objects size: {stats.size}")
-        else:
-            print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")