datachain 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +4 -2
- datachain/catalog/catalog.py +100 -54
- datachain/catalog/datasource.py +4 -6
- datachain/cli/__init__.py +311 -0
- datachain/cli/commands/__init__.py +29 -0
- datachain/cli/commands/datasets.py +129 -0
- datachain/cli/commands/du.py +14 -0
- datachain/cli/commands/index.py +12 -0
- datachain/cli/commands/ls.py +169 -0
- datachain/cli/commands/misc.py +28 -0
- datachain/cli/commands/query.py +53 -0
- datachain/cli/commands/show.py +38 -0
- datachain/cli/parser/__init__.py +547 -0
- datachain/cli/parser/job.py +120 -0
- datachain/cli/parser/studio.py +126 -0
- datachain/cli/parser/utils.py +63 -0
- datachain/{cli_utils.py → cli/utils.py} +27 -1
- datachain/client/azure.py +21 -1
- datachain/client/fsspec.py +45 -13
- datachain/client/gcs.py +10 -2
- datachain/client/local.py +4 -4
- datachain/client/s3.py +10 -0
- datachain/dataset.py +1 -0
- datachain/func/__init__.py +2 -2
- datachain/func/conditional.py +52 -0
- datachain/func/func.py +5 -1
- datachain/lib/arrow.py +4 -0
- datachain/lib/dc.py +18 -3
- datachain/lib/file.py +1 -1
- datachain/lib/listing.py +36 -3
- datachain/lib/signal_schema.py +89 -27
- datachain/listing.py +1 -5
- datachain/node.py +27 -1
- datachain/progress.py +2 -2
- datachain/query/session.py +1 -1
- datachain/studio.py +58 -38
- datachain/utils.py +1 -1
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA +6 -6
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/RECORD +43 -31
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/WHEEL +1 -1
- datachain/cli.py +0 -1475
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/LICENSE +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -61,14 +61,16 @@ class DataChainCache:
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
         if size < 0:
-            size = await client.get_size(from_path)
+            size = await client.get_size(from_path, version_id=file.version)
         cb = callback or TqdmCallback(
             tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
             tqdm_cls=Tqdm,
             size=size,
         )
         try:
-            await client.get_file(
+            await client.get_file(
+                from_path, tmp_info, callback=cb, version_id=file.version
+            )
         finally:
             if not callback:
                 cb.close()
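The change above threads the file's object version through both the size lookup and the download, so versioned buckets fetch exactly the object that was indexed. A minimal caller-side sketch of the same pattern, reusing the `get_size`/`get_file` signatures visible in the diff; the helper name and the `file` attributes used here are illustrative, not library API:

```python
async def download_exact_version(client, file, tmp_path):
    # Size the specific version (a versioned bucket may hold newer data
    # under the same key), then download that same version.
    size = file.size
    if size < 0:
        size = await client.get_size(file.path, version_id=file.version)
    await client.get_file(file.path, tmp_path, version_id=file.version)
    return size
```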
datachain/catalog/catalog.py
CHANGED
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""

-    listing: "Listing"
+    listing: Optional["Listing"]
+    client: "Client"
     sources: list[DataSource]

     # The source path within the bucket
@@ -268,9 +269,7 @@ class NodeGroup:
         Download this node group to cache.
         """
         if self.sources:
-            self.
-                self.iternodes(recursive), shared_progress_bar=pbar
-            )
+            self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)


 def check_output_dataset_file(
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(

     # Collect all sources to process
     for node_group in node_groups:
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         valid_sources: list[DataSource] = []
         for dsrc in node_group.sources:
             if dsrc.is_single_object():
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
                 total_files += 1
                 valid_sources.append(dsrc)
             else:
+                assert listing
                 node = dsrc.node
                 if not recursive:
                     print(f"{node.full_path} is a directory (not copied).")
@@ -433,37 +433,51 @@ def instantiate_node_groups(
     )

     output_dir = output
+    output_file = None
     if copy_to_filename:
         output_dir = os.path.dirname(output)
         if not output_dir:
             output_dir = "."
+        output_file = os.path.basename(output)

     # Instantiate these nodes
     for node_group in node_groups:
         if not node_group.sources:
             continue
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         source_path: str = node_group.source_path

         copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
-            node_group.sources
-            listing.
+        if not listing:
+            source = node_group.sources[0]
+            client = source.client
+            node = NodeWithPath(source.node, [output_file or source.node.path])
+            instantiated_nodes = [node]
+            if not virtual_only:
+                node.instantiate(
+                    client, output_dir, instantiate_progress_bar, force=force
+                )
+        else:
+            instantiated_nodes = listing.collect_nodes_to_instantiate(
+                node_group.sources,
+                copy_to_filename,
+                recursive,
+                copy_dir_contents,
+                source_path,
+                node_group.is_edatachain,
+                node_group.is_dataset,
             )
+            if not virtual_only:
+                listing.instantiate_nodes(
+                    instantiated_nodes,
+                    output_dir,
+                    total_files,
+                    force=force,
+                    shared_progress_bar=instantiate_progress_bar,
+                )
+
         node_group.instantiated_nodes = instantiated_nodes
+
     if instantiate_progress_bar:
         instantiate_progress_bar.close()
@@ -592,7 +606,7 @@ class Catalog:
         client_config=None,
         object_name="file",
         skip_indexing=False,
-    ) -> tuple["Listing", str]:
+    ) -> tuple[Optional["Listing"], "Client", str]:
         from datachain.lib.dc import DataChain
         from datachain.listing import Listing
@@ -603,16 +617,19 @@ class Catalog:
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
+        lst = None
+        client = Client.get_client(list_uri, self.cache, **self.client_config)
+
+        if list_ds_name:
+            lst = Listing(
+                self.metastore.clone(),
+                self.warehouse.clone(),
+                client,
+                dataset_name=list_ds_name,
+                object_name=object_name,
+            )

-        lst
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            Client.get_client(list_uri, self.cache, **self.client_config),
-            dataset_name=list_ds_name,
-            object_name=object_name,
-        )
-
-        return lst, list_path
+        return lst, client, list_path

     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
@@ -635,13 +652,13 @@ class Catalog:
     ) -> Optional[list["DataSource"]]:
         enlisted_sources = []
         for src in sources:  # Opt: parallel
-            listing, file_path = self.enlist_source(
+            listing, client, file_path = self.enlist_source(
                 src,
                 update,
                 client_config=client_config or self.client_config,
                 skip_indexing=skip_indexing,
             )
-            enlisted_sources.append((listing, file_path))
+            enlisted_sources.append((listing, client, file_path))

         if only_index:
             # sometimes we don't really need listing result (e.g on indexing process)
@@ -649,10 +666,16 @@ class Catalog:
             return None

         dsrc_all: list[DataSource] = []
-        for listing, file_path in enlisted_sources:
+        for listing, client, file_path in enlisted_sources:
+            if not listing:
+                nodes = [Node.from_file(client.get_file_info(file_path))]
+                dir_only = False
+            else:
+                nodes = listing.expand_path(file_path)
+                dir_only = file_path.endswith("/")
+            dsrc_all.extend(
+                DataSource(listing, client, node, dir_only) for node in nodes
+            )
         return dsrc_all

     def enlist_sources_grouped(
@@ -667,7 +690,7 @@ class Catalog:

         def _row_to_node(d: dict[str, Any]) -> Node:
             del d["file__source"]
-            return Node.
+            return Node.from_row(d)

         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
@@ -677,7 +700,7 @@ class Catalog:
                 edatachain_data = parse_edatachain_file(src)
                 indexed_sources = []
                 for ds in edatachain_data:
-                    listing, source_path = self.enlist_source(
+                    listing, _, source_path = self.enlist_source(
                         ds["data-source"]["uri"],
                         update,
                         client_config=client_config,
@@ -701,6 +724,7 @@ class Catalog:
                 client = self.get_client(source, **client_config)
                 uri = client.uri
                 dataset_name, _, _, _ = get_listing(uri, self.session)
+                assert dataset_name
                 listing = Listing(
                     self.metastore.clone(),
                     self.warehouse.clone(),
@@ -713,6 +737,7 @@ class Catalog:
                 indexed_sources.append(
                     (
                         listing,
+                        client,
                         source,
                         [_row_to_node(r) for r in rows],
                         ds_name,
@@ -722,25 +747,28 @@ class Catalog:

                 enlisted_sources.append((False, True, indexed_sources))
             else:
-                listing, source_path = self.enlist_source(
+                listing, client, source_path = self.enlist_source(
                     src, update, client_config=client_config
                 )
-                enlisted_sources.append((False, False, (listing, source_path)))
+                enlisted_sources.append((False, False, (listing, client, source_path)))

         node_groups = []
         for is_datachain, is_dataset, payload in enlisted_sources:  # Opt: parallel
             if is_dataset:
                 for (
                     listing,
+                    client,
                     source_path,
                     nodes,
                     dataset_name,
                     dataset_version,
                 ) in payload:
+                    assert listing
+                    dsrc = [DataSource(listing, client, node) for node in nodes]
                     node_groups.append(
                         NodeGroup(
                             listing,
+                            client,
                             dsrc,
                             source_path,
                             dataset_name=dataset_name,
@@ -749,18 +777,30 @@ class Catalog:
                     )
             elif is_datachain:
                 for listing, source_path, paths in payload:
+                    assert listing
+                    dsrc = [
+                        DataSource(listing, listing.client, listing.resolve_path(p))
+                        for p in paths
+                    ]
                     node_groups.append(
-                        NodeGroup(
+                        NodeGroup(
+                            listing,
+                            listing.client,
+                            dsrc,
+                            source_path,
+                            is_edatachain=True,
+                        )
                     )
             else:
-                listing, source_path = payload
+                listing, client, source_path = payload
+                if not listing:
+                    nodes = [Node.from_file(client.get_file_info(source_path))]
+                    as_container = False
+                else:
+                    as_container = source_path.endswith("/")
+                    nodes = listing.expand_path(source_path, use_glob=not no_glob)
+                dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
+                node_groups.append(NodeGroup(listing, client, dsrc, source_path))

         return node_groups

@@ -1196,10 +1236,16 @@ class Catalog:

         return q.to_db_records()

-    def signed_url(
+    def signed_url(
+        self,
+        source: str,
+        path: str,
+        version_id: Optional[str] = None,
+        client_config=None,
+    ) -> str:
         client_config = client_config or self.client_config
         client = Client.get_client(source, self.cache, **client_config)
-        return client.url(path)
+        return client.url(path, version_id=version_id)

     def export_dataset_table(
         self,
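The thread running through these hunks is that `enlist_source` now returns a `(listing, client, path)` triple, and `listing` may be `None` when a source has no listing dataset, so callers branch on it and carry the `Client` explicitly. A minimal sketch of that caller-side pattern, built only from calls visible in the diff (`get_file_info`, `expand_path`, `Node.from_file`, `DataSource`); the helper name is illustrative:

```python
from datachain.catalog.datasource import DataSource
from datachain.node import Node


def collect_sources(catalog, uri, update=False):
    # enlist_source() now also hands back the Client, even when no
    # listing dataset exists for this source.
    listing, client, path = catalog.enlist_source(uri, update)

    if listing is None:
        # Single object with no listing: build the node straight from the client.
        nodes = [Node.from_file(client.get_file_info(path))]
        dir_only = False
    else:
        # Listing available: expand globs/directories through the warehouse.
        nodes = listing.expand_path(path)
        dir_only = path.endswith("/")

    # DataSource now carries the client alongside the (possibly None) listing.
    return [DataSource(listing, client, node, dir_only) for node in nodes]
```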
datachain/catalog/datasource.py
CHANGED
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath


 class DataSource:
-    def __init__(self, listing, node, as_container=False):
+    def __init__(self, listing, client, node, as_container=False):
         self.listing = listing
+        self.client = client
         self.node = node
         self.as_container = (
             as_container  # Indicates whether a .tar file is handled as a container
         )

-    def get_full_path(self):
-        return self.get_node_full_path(self.node)
-
     def get_node_full_path(self, node):
-        return self.
+        return self.client.get_full_path(node.full_path)

     def get_node_full_path_from_path(self, full_path):
-        return self.
+        return self.client.get_full_path(full_path)

     def is_single_object(self):
         return self.node.dir_type == DirType.FILE or (
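A brief usage sketch of the reworked class: the `Client` is now passed in explicitly and full paths are resolved through it rather than through the listing. The `listing=None` case mirrors the single-object handling in the catalog changes above; the function and variable names here are illustrative:

```python
from datachain.catalog.datasource import DataSource


def full_paths(client, listing, nodes):
    # DataSource now takes the client explicitly; listing may be None for
    # single objects, as in the catalog changes above.
    sources = [DataSource(listing, client, node) for node in nodes]
    # Full paths are resolved through the client, not the listing.
    return [dsrc.get_node_full_path(dsrc.node) for dsrc in sources]
```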
datachain/cli/__init__.py
ADDED
@@ -0,0 +1,311 @@
+import logging
+import os
+import sys
+import traceback
+from multiprocessing import freeze_support
+from typing import Optional
+
+from datachain.cli.utils import get_logging_level
+from datachain.telemetry import telemetry
+
+from .commands import (
+    clear_cache,
+    completion,
+    dataset_stats,
+    du,
+    edit_dataset,
+    garbage_collect,
+    index,
+    list_datasets,
+    ls,
+    query,
+    rm_dataset,
+    show,
+)
+from .parser import get_parser
+
+logger = logging.getLogger("datachain")
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    from datachain.catalog import get_catalog
+
+    # Required for Windows multiprocessing support
+    freeze_support()
+
+    datachain_parser = get_parser()
+    args = datachain_parser.parse_args(argv)
+
+    if args.command in ("internal-run-udf", "internal-run-udf-worker"):
+        return handle_udf(args.command)
+
+    logger.addHandler(logging.StreamHandler())
+    logging_level = get_logging_level(args)
+    logger.setLevel(logging_level)
+
+    client_config = {
+        "aws_endpoint_url": args.aws_endpoint_url,
+        "anon": args.anon,
+    }
+
+    if args.debug_sql:
+        # This also sets this environment variable for any subprocesses
+        os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
+
+    error = None
+
+    try:
+        catalog = get_catalog(client_config=client_config)
+        return handle_command(args, catalog, client_config)
+    except BrokenPipeError as exc:
+        error, return_code = handle_broken_pipe_error(exc)
+        return return_code
+    except (KeyboardInterrupt, Exception) as exc:
+        error, return_code = handle_general_exception(exc, args, logging_level)
+        return return_code
+    finally:
+        telemetry.send_cli_call(args.command, error=error)
+
+
+def handle_command(args, catalog, client_config) -> int:
+    """Handle the different CLI commands."""
+    from datachain.studio import process_jobs_args, process_studio_cli_args
+
+    command_handlers = {
+        "cp": lambda: handle_cp_command(args, catalog),
+        "clone": lambda: handle_clone_command(args, catalog),
+        "dataset": lambda: handle_dataset_command(args, catalog),
+        "ds": lambda: handle_dataset_command(args, catalog),
+        "ls": lambda: handle_ls_command(args, client_config),
+        "show": lambda: handle_show_command(args, catalog),
+        "du": lambda: handle_du_command(args, catalog, client_config),
+        "find": lambda: handle_find_command(args, catalog),
+        "index": lambda: handle_index_command(args, catalog),
+        "completion": lambda: handle_completion_command(args),
+        "query": lambda: handle_query_command(args, catalog),
+        "clear-cache": lambda: clear_cache(catalog),
+        "gc": lambda: garbage_collect(catalog),
+        "studio": lambda: process_studio_cli_args(args),
+        "job": lambda: process_jobs_args(args),
+    }
+
+    handler = command_handlers.get(args.command)
+    if handler:
+        handler()
+        return 0
+    print(f"invalid command: {args.command}", file=sys.stderr)
+    return 1
+
+
+def handle_cp_command(args, catalog):
+    catalog.cp(
+        args.sources,
+        args.output,
+        force=bool(args.force),
+        update=bool(args.update),
+        recursive=bool(args.recursive),
+        edatachain_file=None,
+        edatachain_only=False,
+        no_edatachain_file=True,
+        no_glob=args.no_glob,
+    )
+
+
+def handle_clone_command(args, catalog):
+    catalog.clone(
+        args.sources,
+        args.output,
+        force=bool(args.force),
+        update=bool(args.update),
+        recursive=bool(args.recursive),
+        no_glob=args.no_glob,
+        no_cp=args.no_cp,
+        edatachain=args.edatachain,
+        edatachain_file=args.edatachain_file,
+    )
+
+
+def handle_dataset_command(args, catalog):
+    dataset_commands = {
+        "pull": lambda: catalog.pull_dataset(
+            args.dataset,
+            args.output,
+            local_ds_name=args.local_name,
+            local_ds_version=args.local_version,
+            cp=args.cp,
+            force=bool(args.force),
+            edatachain=args.edatachain,
+            edatachain_file=args.edatachain_file,
+        ),
+        "edit": lambda: edit_dataset(
+            catalog,
+            args.name,
+            new_name=args.new_name,
+            description=args.description,
+            labels=args.labels,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "ls": lambda: list_datasets(
+            catalog=catalog,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "rm": lambda: rm_dataset(
+            catalog,
+            args.name,
+            version=args.version,
+            force=args.force,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "remove": lambda: rm_dataset(
+            catalog,
+            args.name,
+            version=args.version,
+            force=args.force,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "stats": lambda: dataset_stats(
+            catalog,
+            args.name,
+            args.version,
+            show_bytes=args.bytes,
+            si=args.si,
+        ),
+    }
+
+    handler = dataset_commands.get(args.datasets_cmd)
+    if handler:
+        return handler()
+    raise Exception(f"Unexpected command {args.datasets_cmd}")
+
+
+def handle_ls_command(args, client_config):
+    ls(
+        args.sources,
+        long=bool(args.long),
+        studio=args.studio,
+        local=args.local,
+        all=args.all,
+        team=args.team,
+        update=bool(args.update),
+        client_config=client_config,
+    )
+
+
+def handle_show_command(args, catalog):
+    show(
+        catalog,
+        args.name,
+        args.version,
+        limit=args.limit,
+        offset=args.offset,
+        columns=args.columns,
+        no_collapse=args.no_collapse,
+        schema=args.schema,
+    )
+
+
+def handle_du_command(args, catalog, client_config):
+    du(
+        catalog,
+        args.sources,
+        show_bytes=args.bytes,
+        depth=args.depth,
+        si=args.si,
+        update=bool(args.update),
+        client_config=client_config,
+    )
+
+
+def handle_find_command(args, catalog):
+    results_found = False
+    for result in catalog.find(
+        args.sources,
+        update=bool(args.update),
+        names=args.name,
+        inames=args.iname,
+        paths=args.path,
+        ipaths=args.ipath,
+        size=args.size,
+        typ=args.type,
+        columns=args.columns,
+    ):
+        print(result)
+        results_found = True
+    if not results_found:
+        print("No results")
+
+
+def handle_index_command(args, catalog):
+    index(
+        catalog,
+        args.sources,
+        update=bool(args.update),
+    )
+
+
+def handle_completion_command(args):
+    print(completion(args.shell))
+
+
+def handle_query_command(args, catalog):
+    query(
+        catalog,
+        args.script,
+        parallel=args.parallel,
+        params=args.param,
+    )
+
+
+def handle_broken_pipe_error(exc):
+    # Python flushes standard streams on exit; redirect remaining output
+    # to devnull to avoid another BrokenPipeError at shutdown
+    # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
+    error = str(exc)
+    devnull = os.open(os.devnull, os.O_WRONLY)
+    os.dup2(devnull, sys.stdout.fileno())
+    return error, 141  # 128 + 13 (SIGPIPE)
+
+
+def handle_general_exception(exc, args, logging_level):
+    error = str(exc)
+    if isinstance(exc, KeyboardInterrupt):
+        msg = "Operation cancelled by the user"
+    else:
+        msg = str(exc)
+    print("Error:", msg, file=sys.stderr)
+    if logging_level <= logging.DEBUG:
+        traceback.print_exception(
+            type(exc),
+            exc,
+            exc.__traceback__,
+            file=sys.stderr,
+        )
+    if args.pdb:
+        import pdb  # noqa: T100
+
+        pdb.post_mortem()
+    return error, 1
+
+
+def handle_udf(command):
+    if command == "internal-run-udf":
+        from datachain.query.dispatch import udf_entrypoint
+
+        return udf_entrypoint()
+
+    if command == "internal-run-udf-worker":
+        from datachain.query.dispatch import udf_worker_entrypoint
+
+        return udf_worker_entrypoint()
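The new package replaces the monolithic `datachain/cli.py` with a `main()` that parses arguments once and dispatches through a dict of handlers. A sketch of calling it programmatically; the exact flags accepted depend on `datachain/cli/parser`, which is not shown in this diff, so the argument list here is an assumption:

```python
import sys

from datachain.cli import main

if __name__ == "__main__":
    # Equivalent to running the `datachain` console script; returns an
    # exit code (0 on success, 1 on error, 141 on a broken pipe).
    sys.exit(main(["ls", "s3://my-bucket/"]))  # assumed argv; see parser/
```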
datachain/cli/commands/__init__.py
ADDED
@@ -0,0 +1,29 @@
+from .datasets import (
+    dataset_stats,
+    edit_dataset,
+    list_datasets,
+    list_datasets_local,
+    rm_dataset,
+)
+from .du import du
+from .index import index
+from .ls import ls
+from .misc import clear_cache, completion, garbage_collect
+from .query import query
+from .show import show
+
+__all__ = [
+    "clear_cache",
+    "completion",
+    "dataset_stats",
+    "du",
+    "edit_dataset",
+    "garbage_collect",
+    "index",
+    "list_datasets",
+    "list_datasets_local",
+    "ls",
+    "query",
+    "rm_dataset",
+    "show",
+]
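With the re-exports above, command implementations can be imported from the package root rather than from their individual modules. A brief sketch; the keyword arguments mirror the ones wired up in `handle_dataset_command()` earlier in this diff, while the argument values and the no-argument `get_catalog()` call are illustrative assumptions:

```python
from datachain.catalog import get_catalog
from datachain.cli.commands import list_datasets

# Roughly what `datachain dataset ls --local` drives under the hood.
list_datasets(catalog=get_catalog(), studio=False, local=True, all=False, team=None)
```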