datachain 0.8.10__py3-none-any.whl → 0.8.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +102 -138
- datachain/cli/__init__.py +7 -6
- datachain/cli/parser/__init__.py +27 -16
- datachain/cli/parser/studio.py +7 -6
- datachain/cli/parser/utils.py +18 -0
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/dataset.py +1 -1
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +67 -23
- datachain/func/func.py +17 -5
- datachain/lib/dc.py +24 -4
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/remote/studio.py +1 -1
- datachain/studio.py +1 -1
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/METADATA +3 -7
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/RECORD +31 -31
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/LICENSE +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/WHEEL +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.11.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED

@@ -22,15 +22,15 @@ def try_scandir(path):
         pass
 
 
-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return
+    return Cache(cache_dir, tmp_dir=tmp_dir)
 
 
 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
             cache.destroy()
 
 
-class
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
datachain/catalog/__init__.py
CHANGED

@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog
 
@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]
datachain/catalog/catalog.py
CHANGED

@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time
@@ -26,11 +27,10 @@ from uuid import uuid4
 
 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm
 
-from datachain.cache import
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
@@ -57,7 +57,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir
 
 from .datasource import DataSource
 
@@ -73,7 +73,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"
 
 TTL_INT = 4 * 60 * 60
 
@@ -99,6 +98,47 @@ def noop(_: str):
     pass
 
 
+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
@@ -247,7 +287,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +311,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:
@@ -333,7 +328,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None
 
-    if
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename
 
     if not os.path.isdir(output):
@@ -358,10 +353,6 @@ def prepare_output_for_cp(
             copy_to_filename = output
         else:
            raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename
 
 
@@ -465,8 +456,6 @@ def instantiate_node_groups(
             copy_to_filename,
             recursive,
             copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
             node_group.is_dataset,
         )
         if not virtual_only:
@@ -484,24 +473,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()
 
 
-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:
@@ -536,7 +507,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def clone_catalog_with_cache(catalog: "Catalog", cache: "
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone
@@ -559,7 +530,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache =
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,
@@ -703,22 +674,8 @@ class Catalog:
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
         for src in sources:  # Opt: parallel
-
-
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:
@@ -796,7 +753,6 @@ class Catalog:
                         listing.client,
                         dsrc,
                         source_path,
-                        is_edatachain=True,
                     )
                 )
            else:
@@ -1360,8 +1316,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1373,8 +1327,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1541,8 +1493,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1551,9 +1501,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp
+        if not no_cp:
             self.cp(
                 sources,
                 output,
@@ -1561,9 +1510,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:
@@ -1588,6 +1535,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
@@ -1601,13 +1550,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
 
+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-
-
-
-
-
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)
 
+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
@@ -1626,17 +1610,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) ->
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1628,11 @@ class Catalog:
         )
 
         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force,
+            node_groups, output, force, no_cp
         )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
-        )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return
 
         desc_max_len = max(len(output) + 16, 19)
         bar_format = (
@@ -1667,7 +1642,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )
 
-        if not
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1654,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data
 
     def du(
         self,
datachain/cli/__init__.py
CHANGED

@@ -47,9 +47,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)
 
-    client_config =
-
-
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )
 
     if args.debug_sql:
         # This also sets this environment variable for any subprocesses
@@ -107,9 +111,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
         recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )
 
datachain/cli/parser/__init__.py
CHANGED

@@ -8,7 +8,14 @@ from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
 
 from .job import add_jobs_parser
 from .studio import add_auth_parser
-from .utils import
+from .utils import (
+    FIND_COLUMNS,
+    add_anon_arg,
+    add_show_args,
+    add_sources_arg,
+    add_update_arg,
+    find_columns_type,
+)
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -32,19 +39,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "-q", "--quiet", action="count", default=0, help="Be quiet"
     )
 
-    parent_parser.add_argument(
-        "--anon",
-        action="store_true",
-        help="Use anonymous access to storage",
-    )
-    parent_parser.add_argument(
-        "-u",
-        "--update",
-        action="count",
-        default=0,
-        help="Update cached list of files for the sources",
-    )
-
     parent_parser.add_argument(
         "--debug-sql",
         action="store_true",
@@ -92,6 +86,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not expand globs (such as * or ?)",
     )
+    add_anon_arg(parse_cp)
+    add_update_arg(parse_cp)
 
     parse_clone = subp.add_parser(
         "clone", parents=[parent_parser], description="Copy data files from the cloud."
@@ -127,6 +123,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not copy files, just create a dataset",
     )
+    add_anon_arg(parse_clone)
+    add_update_arg(parse_clone)
 
     add_auth_parser(subp, parent_parser)
     add_jobs_parser(subp, parent_parser)
@@ -137,6 +135,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Commands for managing datasets.",
     )
+    add_anon_arg(datasets_parser)
     datasets_subparser = datasets_parser.add_subparsers(
         dest="datasets_cmd",
         help="Use `datachain dataset CMD --help` to display command-specific help",
@@ -336,6 +335,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents."
     )
+    add_anon_arg(parse_ls)
+    add_update_arg(parse_ls)
     add_sources_arg(parse_ls, nargs="*")
     parse_ls.add_argument(
         "-l",
@@ -375,6 +376,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "du", parents=[parent_parser], description="Display space usage."
     )
     add_sources_arg(parse_du)
+    add_anon_arg(parse_du)
+    add_update_arg(parse_du)
     parse_du.add_argument(
         "-b",
         "--bytes",
@@ -404,6 +407,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_find = subp.add_parser(
         "find", parents=[parent_parser], description="Search in a directory hierarchy."
     )
+    add_anon_arg(parse_find)
+    add_update_arg(parse_find)
     add_sources_arg(parse_find)
     parse_find.add_argument(
         "--name",
@@ -457,6 +462,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_index = subp.add_parser(
         "index", parents=[parent_parser], description="Index storage location."
     )
+    add_anon_arg(parse_index)
+    add_update_arg(parse_index)
     add_sources_arg(parse_index)
 
     show_parser = subp.add_parser(
@@ -480,6 +487,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
     )
+    add_anon_arg(query_parser)
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
@@ -504,14 +512,17 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )
 
-    subp.add_parser(
+    parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
     )
-
+    add_anon_arg(parse_clear_cache)
+
+    parse_gc = subp.add_parser(
         "gc", parents=[parent_parser], description="Garbage collect temporary tables."
     )
+    add_anon_arg(parse_gc)
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
datachain/cli/parser/studio.py
CHANGED

@@ -1,9 +1,8 @@
 def add_auth_parser(subparsers, parent_parser) -> None:
+    from dvc_studio_client.auth import AVAILABLE_SCOPES
+
     auth_help = "Manage Studio authentication"
-    auth_description =
-        "Manage authentication and settings for Studio. "
-        "Configure tokens for sharing datasets and using Studio features."
-    )
+    auth_description = "Manage authentication and settings for Studio. "
 
     auth_parser = subparsers.add_parser(
         "auth",
@@ -19,8 +18,10 @@ def add_auth_parser(subparsers, parent_parser) -> None:
     auth_login_help = "Authenticate with Studio"
     auth_login_description = (
         "Authenticate with Studio using default scopes. "
-        "A random name will be assigned
+        "A random name will be assigned if the token name is not specified."
     )
+
+    allowed_scopes = ", ".join(AVAILABLE_SCOPES)
     login_parser = auth_subparser.add_parser(
         "login",
         parents=[parent_parser],
@@ -40,7 +41,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         "--scopes",
         action="store",
         default=None,
-        help="Authentication token scopes",
+        help=f"Authentication token scopes. Allowed scopes: {allowed_scopes}",
     )
 
     login_parser.add_argument(
datachain/cli/parser/utils.py
CHANGED

@@ -34,6 +34,24 @@ def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Act
     )
 
 
+def add_anon_arg(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "--anon",
+        action="store_true",
+        help="Use anonymous access to storage",
+    )
+
+
+def add_update_arg(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "-u",
+        "--update",
+        action="count",
+        default=0,
+        help="Update cached list of files for the sources",
+    )
+
+
 def add_show_args(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--limit",