datachain 0.8.10__py3-none-any.whl → 0.8.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +103 -158
- datachain/cli/__init__.py +7 -14
- datachain/cli/commands/__init__.py +0 -2
- datachain/cli/commands/datasets.py +0 -19
- datachain/cli/parser/__init__.py +27 -41
- datachain/cli/parser/studio.py +7 -6
- datachain/cli/parser/utils.py +18 -0
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/dataset.py +1 -7
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +77 -26
- datachain/func/func.py +17 -6
- datachain/lib/dc.py +24 -4
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/remote/studio.py +2 -14
- datachain/studio.py +1 -1
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/METADATA +3 -7
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/RECORD +33 -33
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/LICENSE +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/WHEEL +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -22,15 +22,15 @@ def try_scandir(path):
         pass


-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return
+    return Cache(cache_dir, tmp_dir=tmp_dir)


 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()


-class
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
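These hunks rename the cache class returned by get_temp_cache and yielded by temporary_cache; the context-manager shape is unchanged. A minimal usage sketch based only on the signatures shown above (the temp directory below is illustrative):

    import tempfile

    tmp_dir = tempfile.mkdtemp()  # illustrative scratch location
    with temporary_cache(tmp_dir, prefix="session-") as cache:
        ...  # work with the Cache instance; it is destroyed on exit per the delete flag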
datachain/catalog/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog

@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]
datachain/catalog/catalog.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time
@@ -26,11 +27,10 @@ from uuid import uuid4

 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm

-from datachain.cache import
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
@@ -38,7 +38,6 @@ from datachain.dataset import (
     DatasetDependency,
     DatasetListRecord,
     DatasetRecord,
-    DatasetStats,
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
@@ -57,7 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir

 from .datasource import DataSource

@@ -73,7 +72,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")

 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"

 TTL_INT = 4 * 60 * 60

@@ -99,6 +97,47 @@ def noop(_: str):
     pass


+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
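The new shutdown_process helper escalates SIGINT -> terminate() -> kill(), waiting between steps. A minimal calling sketch, assuming a long-running child process; the timeout values are illustrative, not datachain defaults:

    import subprocess

    proc = subprocess.Popen(["python", "-c", "import time; time.sleep(600)"])
    # SIGINT first, wait up to 10s; then SIGTERM, wait up to 5s; finally SIGKILL.
    returncode = shutdown_process(proc, interrupt_timeout=10, terminate_timeout=5)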
@@ -247,7 +286,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +310,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)


-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:
@@ -333,7 +327,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None

-    if
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename

     if not os.path.isdir(output):
@@ -358,10 +352,6 @@ def prepare_output_for_cp(
             copy_to_filename = output
         else:
             raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename


@@ -465,8 +455,6 @@ def instantiate_node_groups(
         copy_to_filename,
         recursive,
         copy_dir_contents,
-        source_path,
-        node_group.is_edatachain,
         node_group.is_dataset,
     )
     if not virtual_only:
@@ -484,24 +472,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()


-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:
@@ -536,7 +506,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""


-def clone_catalog_with_cache(catalog: "Catalog", cache: "
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone
@@ -559,7 +529,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache =
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,
@@ -703,22 +673,8 @@ class Catalog:
        enlisted_sources: list[tuple[bool, bool, Any]] = []
        client_config = client_config or self.client_config
        for src in sources:  # Opt: parallel
-
-
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:
@@ -796,7 +752,6 @@ class Catalog:
                         listing.client,
                         dsrc,
                         source_path,
-                        is_edatachain=True,
                     )
                 )
             else:
@@ -1279,17 +1234,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

-    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
-        """
-        Returns tuple with dataset stats: total number of rows and total dataset size.
-        """
-        dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version or dataset.latest_version)
-        return DatasetStats(
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-        )
-
     def remove_dataset(
         self,
         name: str,
@@ -1360,8 +1304,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1373,8 +1315,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1439,19 +1379,12 @@ class Catalog:
         except DatasetNotFoundError:
             pass

-        stats_response = studio_client.dataset_stats(
-            remote_ds_name, remote_ds_version.version
-        )
-        if not stats_response.ok:
-            raise_remote_error(stats_response.message)
-        ds_stats = stats_response.data
-
         dataset_save_progress_bar = tqdm(
             desc=f"Saving dataset {remote_ds_uri} locally: ",
             unit=" rows",
             unit_scale=True,
             unit_divisor=1000,
-            total=
+            total=remote_ds_version.num_objects,  # type: ignore [union-attr]
             leave=False,
         )

@@ -1541,8 +1474,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1551,9 +1482,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp
+        if not no_cp:
             self.cp(
                 sources,
                 output,
@@ -1561,9 +1491,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:
@@ -1588,6 +1516,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
@@ -1601,13 +1531,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}

+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-
-
-
-
-
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)

+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
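The run_query changes above use a handler-swap pattern: record the original handlers, ignore SIGINT in the parent, turn SIGTERM into an exception, and always restore the originals in finally. A stripped-down sketch of the same pattern outside datachain (all names here are illustrative, not datachain API):

    import signal

    class Terminated(RuntimeError):
        pass

    def _raise_terminated(sig, _frame):
        raise Terminated(sig)

    orig_int = signal.getsignal(signal.SIGINT)
    orig_term = signal.getsignal(signal.SIGTERM)
    signal.signal(signal.SIGINT, signal.SIG_IGN)      # Ctrl-C is delivered to the child in the same group
    signal.signal(signal.SIGTERM, _raise_terminated)  # turn SIGTERM into an exception
    try:
        ...  # wait on the child process here
    except Terminated:
        ...  # shut the child down gracefully
    finally:
        signal.signal(signal.SIGINT, orig_int)        # always restore the original handlers
        signal.signal(signal.SIGTERM, orig_term)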
@@ -1626,17 +1591,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) ->
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1609,11 @@ class Catalog:
         )

         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force,
-        )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
+            node_groups, output, force, no_cp
         )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return

         desc_max_len = max(len(output) + 16, 19)
         bar_format = (
@@ -1667,7 +1623,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )

-        if not
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1635,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data

     def du(
         self,
datachain/cli/__init__.py
CHANGED
@@ -11,7 +11,6 @@ from datachain.telemetry import telemetry
 from .commands import (
     clear_cache,
     completion,
-    dataset_stats,
     du,
     edit_dataset,
     garbage_collect,
@@ -47,9 +46,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)

-    client_config =
-
-
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )

     if args.debug_sql:
         # This also sets this environment variable for any subprocesses
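With this change the CLI only passes an anon entry when the flag is actually set. A small sketch of the resulting configuration (the Namespace below stands in for parsed CLI args and is hypothetical):

    from argparse import Namespace

    args = Namespace(anon=True)
    client_config = {"anon": args.anon} if getattr(args, "anon", False) else {}
    # -> {"anon": True}; with anon=False (or missing) the config stays {}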
@@ -107,9 +110,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
         recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )

@@ -181,13 +181,6 @@ def handle_dataset_command(args, catalog):
             all=args.all,
             team=args.team,
         ),
-        "stats": lambda: dataset_stats(
-            catalog,
-            args.name,
-            args.version,
-            show_bytes=args.bytes,
-            si=args.si,
-        ),
     }

     handler = dataset_commands.get(args.datasets_cmd)
datachain/cli/commands/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from .datasets import (
-    dataset_stats,
     edit_dataset,
     list_datasets,
     list_datasets_local,
@@ -15,7 +14,6 @@ from .show import show
 __all__ = [
     "clear_cache",
     "completion",
-    "dataset_stats",
     "du",
     "edit_dataset",
     "garbage_collect",
datachain/cli/commands/datasets.py
CHANGED
@@ -3,8 +3,6 @@ from typing import TYPE_CHECKING, Optional

 from tabulate import tabulate

-from datachain import utils
-
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

@@ -109,20 +107,3 @@ def edit_dataset(

     if (all or studio) and token:
         edit_studio_dataset(team, name, new_name, description, labels)
-
-
-def dataset_stats(
-    catalog: "Catalog",
-    name: str,
-    version: int,
-    show_bytes=False,
-    si=False,
-):
-    stats = catalog.dataset_stats(name, version)
-
-    if stats:
-        print(f"Number of objects: {stats.num_objects}")
-        if show_bytes:
-            print(f"Total objects size: {stats.size}")
-        else:
-            print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")