datachain 0.8.9__py3-none-any.whl → 0.8.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +102 -138
- datachain/cli/__init__.py +9 -9
- datachain/cli/parser/__init__.py +36 -20
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/studio.py +35 -34
- datachain/cli/parser/utils.py +19 -1
- datachain/cli/utils.py +1 -1
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/data_storage/sqlite.py +38 -7
- datachain/data_storage/warehouse.py +2 -2
- datachain/dataset.py +1 -1
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +67 -23
- datachain/func/func.py +17 -5
- datachain/lib/convert/python_to_sql.py +15 -3
- datachain/lib/dc.py +27 -5
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/query/dataset.py +8 -4
- datachain/remote/studio.py +3 -3
- datachain/sql/sqlite/base.py +35 -14
- datachain/studio.py +8 -8
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/METADATA +3 -7
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/RECORD +38 -38
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/LICENSE +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/WHEEL +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -22,15 +22,15 @@ def try_scandir(path):
         pass
 
 
-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return
+    return Cache(cache_dir, tmp_dir=tmp_dir)
 
 
 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()
 
 
-class
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
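A minimal usage sketch of the renamed cache helpers shown above; the scratch directory and the "demo-" prefix are illustrative only:

import tempfile

from datachain.cache import get_temp_cache, temporary_cache

tmp_dir = tempfile.mkdtemp()  # illustrative scratch directory

# Standalone temporary cache, cleaned up explicitly:
cache = get_temp_cache(tmp_dir, prefix="demo-")
cache.destroy()

# Or scoped via the context manager, which destroys the cache on exit:
with temporary_cache(tmp_dir, prefix="demo-") as cache:
    ...  # use the cache (e.g. cache.odb) while it exists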
datachain/catalog/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog
 
@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]
datachain/catalog/catalog.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time
@@ -26,11 +27,10 @@ from uuid import uuid4
 
 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm
 
-from datachain.cache import
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
@@ -57,7 +57,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir
 
 from .datasource import DataSource
 
@@ -73,7 +73,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"
 
 TTL_INT = 4 * 60 * 60
 
@@ -99,6 +98,47 @@ def noop(_: str):
     pass
 
 
+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
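The new shutdown_process helper escalates SIGINT -> SIGTERM -> SIGKILL, waiting between steps. A standalone sketch of the same escalation applied to a throwaway child process; the sleeping child and the 5-second timeouts are illustrative, not datachain defaults:

import signal
import subprocess
import sys

SIGINT = signal.CTRL_C_EVENT if sys.platform == "win32" else signal.SIGINT

def shutdown(proc: subprocess.Popen, interrupt_timeout=5, terminate_timeout=5) -> int:
    proc.send_signal(SIGINT)          # 1) ask politely with SIGINT / CTRL_C_EVENT
    try:
        return proc.wait(interrupt_timeout)
    except subprocess.TimeoutExpired:
        proc.terminate()              # 2) escalate to SIGTERM
        try:
            return proc.wait(terminate_timeout)
        except subprocess.TimeoutExpired:
            proc.kill()               # 3) last resort: SIGKILL
            return proc.wait()

child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
print("child exited with", shutdown(child))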
@@ -247,7 +287,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +311,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:
@@ -333,7 +328,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None
 
-    if
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename
 
     if not os.path.isdir(output):
@@ -358,10 +353,6 @@ def prepare_output_for_cp(
         copy_to_filename = output
     else:
         raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename
 
 
@@ -465,8 +456,6 @@ def instantiate_node_groups(
             copy_to_filename,
             recursive,
             copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
            node_group.is_dataset,
         )
         if not virtual_only:
@@ -484,24 +473,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()
 
 
-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:
@@ -536,7 +507,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def clone_catalog_with_cache(catalog: "Catalog", cache: "
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone
@@ -559,7 +530,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache =
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,
@@ -703,22 +674,8 @@ class Catalog:
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
         for src in sources:  # Opt: parallel
-
-
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:
@@ -796,7 +753,6 @@ class Catalog:
                     listing.client,
                     dsrc,
                     source_path,
-                    is_edatachain=True,
                 )
             )
         else:
@@ -1360,8 +1316,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1373,8 +1327,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1541,8 +1493,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1551,9 +1501,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp
+        if not no_cp:
             self.cp(
                 sources,
                 output,
@@ -1561,9 +1510,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:
@@ -1588,6 +1535,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
@@ -1601,13 +1550,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
 
+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-
-
-
-
-
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)
 
+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
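The run_query changes above follow a save/swap/restore pattern: the parent ignores SIGINT (a terminal Ctrl+C already reaches the child through the foreground process group), turns SIGTERM into a TerminationSignal, and restores both handlers in finally. A stripped-down sketch of just the SIGINT part; the sleeping child is illustrative:

import signal
import subprocess
import sys

child_code = "import time; time.sleep(30)"

orig_sigint = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, signal.SIG_IGN)  # parent ignores Ctrl+C
try:
    with subprocess.Popen([sys.executable, "-c", child_code]) as proc:
        proc.wait()  # Ctrl+C interrupts only the child; the parent keeps waiting
        print("child exited with", proc.returncode)
finally:
    signal.signal(signal.SIGINT, orig_sigint)  # always restore the original handler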
@@ -1626,17 +1610,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) ->
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1628,11 @@ class Catalog:
         )
 
         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force,
+            node_groups, output, force, no_cp
         )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
-        )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return
 
         desc_max_len = max(len(output) + 16, 19)
         bar_format = (
@@ -1667,7 +1642,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )
 
-        if not
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1654,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data
 
     def du(
         self,
datachain/cli/__init__.py
CHANGED
@@ -47,10 +47,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)
 
-    client_config =
-
-
-
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )
 
     if args.debug_sql:
         # This also sets this environment variable for any subprocesses
@@ -73,7 +76,7 @@ def main(argv: Optional[list[str]] = None) -> int:
 
 def handle_command(args, catalog, client_config) -> int:
     """Handle the different CLI commands."""
-    from datachain.studio import
+    from datachain.studio import process_auth_cli_args, process_jobs_args
 
     command_handlers = {
         "cp": lambda: handle_cp_command(args, catalog),
@@ -89,7 +92,7 @@ def handle_command(args, catalog, client_config) -> int:
         "query": lambda: handle_query_command(args, catalog),
         "clear-cache": lambda: clear_cache(catalog),
         "gc": lambda: garbage_collect(catalog),
-        "
+        "auth": lambda: process_auth_cli_args(args),
         "job": lambda: process_jobs_args(args),
     }
 
@@ -108,9 +111,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
        recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )
 
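Because --anon is now registered on individual subcommands instead of the shared parent parser, main guards the attribute lookup with getattr. A small illustration of the same guard; build_client_config is a hypothetical helper, not part of the datachain CLI:

from argparse import Namespace

def build_client_config(args: Namespace) -> dict:
    # Only populated when the parsed command actually defines --anon.
    return {"anon": args.anon} if getattr(args, "anon", False) else {}

print(build_client_config(Namespace(anon=True)))  # {'anon': True}
print(build_client_config(Namespace()))           # {} for commands without --anon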
datachain/cli/parser/__init__.py
CHANGED
@@ -7,8 +7,15 @@ import shtab
 from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
 
 from .job import add_jobs_parser
-from .studio import
-from .utils import
+from .studio import add_auth_parser
+from .utils import (
+    FIND_COLUMNS,
+    add_anon_arg,
+    add_show_args,
+    add_sources_arg,
+    add_update_arg,
+    find_columns_type,
+)
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -25,25 +32,13 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parser.add_argument("-V", "--version", action="version", version=__version__)
 
     parent_parser = ArgumentParser(add_help=False)
-    parent_parser.add_argument(
-        "--aws-endpoint-url",
-        type=str,
-        help="AWS endpoint URL",
-    )
-    parent_parser.add_argument(
-        "--anon",
-        action="store_true",
-        help="anon flag for remote storage (like awscli's --no-sign-request)",
-    )
-    parent_parser.add_argument(
-        "-u", "--update", action="count", default=0, help="Update cache"
-    )
     parent_parser.add_argument(
         "-v", "--verbose", action="count", default=0, help="Be verbose"
     )
     parent_parser.add_argument(
         "-q", "--quiet", action="count", default=0, help="Be quiet"
     )
+
     parent_parser.add_argument(
         "--debug-sql",
         action="store_true",
@@ -67,7 +62,9 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "cp", parents=[parent_parser], description="Copy data files from the cloud."
     )
     add_sources_arg(parse_cp).complete = shtab.DIR  # type: ignore[attr-defined]
-    parse_cp.add_argument(
+    parse_cp.add_argument(
+        "output", type=str, help="Path to a directory or file to put data to"
+    )
     parse_cp.add_argument(
         "-f",
         "--force",
@@ -89,12 +86,16 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not expand globs (such as * or ?)",
     )
+    add_anon_arg(parse_cp)
+    add_update_arg(parse_cp)
 
     parse_clone = subp.add_parser(
         "clone", parents=[parent_parser], description="Copy data files from the cloud."
     )
     add_sources_arg(parse_clone).complete = shtab.DIR  # type: ignore[attr-defined]
-    parse_clone.add_argument(
+    parse_clone.add_argument(
+        "output", type=str, help="Path to a directory or file to put data to"
+    )
     parse_clone.add_argument(
         "-f",
         "--force",
@@ -122,8 +123,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not copy files, just create a dataset",
     )
+    add_anon_arg(parse_clone)
+    add_update_arg(parse_clone)
 
-
+    add_auth_parser(subp, parent_parser)
     add_jobs_parser(subp, parent_parser)
 
     datasets_parser = subp.add_parser(
@@ -132,6 +135,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Commands for managing datasets.",
     )
+    add_anon_arg(datasets_parser)
     datasets_subparser = datasets_parser.add_subparsers(
         dest="datasets_cmd",
         help="Use `datachain dataset CMD --help` to display command-specific help",
@@ -331,6 +335,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents."
     )
+    add_anon_arg(parse_ls)
+    add_update_arg(parse_ls)
     add_sources_arg(parse_ls, nargs="*")
     parse_ls.add_argument(
         "-l",
@@ -370,6 +376,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "du", parents=[parent_parser], description="Display space usage."
     )
     add_sources_arg(parse_du)
+    add_anon_arg(parse_du)
+    add_update_arg(parse_du)
     parse_du.add_argument(
         "-b",
         "--bytes",
@@ -399,6 +407,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_find = subp.add_parser(
         "find", parents=[parent_parser], description="Search in a directory hierarchy."
     )
+    add_anon_arg(parse_find)
+    add_update_arg(parse_find)
     add_sources_arg(parse_find)
     parse_find.add_argument(
         "--name",
@@ -452,6 +462,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_index = subp.add_parser(
         "index", parents=[parent_parser], description="Index storage location."
     )
+    add_anon_arg(parse_index)
+    add_update_arg(parse_index)
     add_sources_arg(parse_index)
 
     show_parser = subp.add_parser(
@@ -475,6 +487,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
     )
+    add_anon_arg(query_parser)
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
@@ -499,14 +512,17 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )
 
-    subp.add_parser(
+    parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
     )
-
+    add_anon_arg(parse_clear_cache)
+
+    parse_gc = subp.add_parser(
         "gc", parents=[parent_parser], description="Garbage collect temporary tables."
     )
+    add_anon_arg(parse_gc)
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
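The per-command --anon and --update flags are now added through add_anon_arg and add_update_arg from .utils, whose bodies are not part of this diff. A plausible sketch, assuming the helpers simply re-register the flags that were removed from parent_parser (flag names and help strings come from the removed lines above; everything else is an assumption):

from argparse import ArgumentParser

def add_anon_arg(parser: ArgumentParser) -> None:
    # Assumed body: same flag that previously lived on parent_parser.
    parser.add_argument(
        "--anon",
        action="store_true",
        help="anon flag for remote storage (like awscli's --no-sign-request)",
    )

def add_update_arg(parser: ArgumentParser) -> None:
    # Assumed body: same cache-refresh counter that previously lived on parent_parser.
    parser.add_argument(
        "-u", "--update", action="count", default=0, help="Update cache"
    )

# Usage mirrors the calls added throughout get_parser():
parse_ls = ArgumentParser(prog="datachain ls")
add_anon_arg(parse_ls)
add_update_arg(parse_ls)
print(parse_ls.parse_args(["--anon", "-u"]))  # Namespace(anon=True, update=1)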
datachain/cli/parser/job.py
CHANGED
@@ -6,7 +6,7 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     )
     jobs_subparser = jobs_parser.add_subparsers(
         dest="cmd",
-        help="Use `datachain
+        help="Use `datachain auth CMD --help` to display command-specific help",
     )
 
     studio_run_help = "Run a job in Studio"