datachain 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +61 -219
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +4 -4
- datachain/data_storage/warehouse.py +101 -125
- datachain/lib/arrow.py +2 -15
- datachain/lib/data_model.py +10 -2
- datachain/lib/dc.py +211 -52
- datachain/lib/func/__init__.py +20 -2
- datachain/lib/func/aggregate.py +319 -8
- datachain/lib/func/func.py +97 -9
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +8 -5
- datachain/lib/udf.py +3 -3
- datachain/lib/utils.py +30 -0
- datachain/listing.py +22 -48
- datachain/query/dataset.py +11 -3
- datachain/remote/studio.py +63 -14
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/METADATA +7 -6
- {datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/RECORD +30 -29
- {datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/WHEEL +1 -1
- {datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/LICENSE +0 -0
- {datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.1.dist-info → datachain-0.6.3.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,4 +1,3 @@
-import glob
 import io
 import json
 import logging
@@ -35,7 +34,6 @@ from tqdm import tqdm
 
 from datachain.cache import DataChainCache
 from datachain.client import Client
-from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
     DATASET_PREFIX,
     QUERY_DATASET_PREFIX,
@@ -48,12 +46,10 @@ from datachain.dataset import (
     parse_dataset_uri,
 )
 from datachain.error import (
-    ClientError,
     DataChainError,
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
-    PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptRunError,
 )
@@ -61,8 +57,8 @@ from datachain.listing import Listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
-from datachain.sql.types import
-from datachain.storage import
+from datachain.sql.types import DateTime, SQLType, String
+from datachain.storage import StorageURI
 from datachain.utils import (
     DataChainDir,
     batched,
@@ -102,7 +98,7 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
 
 
-def
+def raise_remote_error(error_message: str) -> NoReturn:
     raise DataChainError(f"Error from server: {error_message}")
 
 
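The new module-level helper is annotated with NoReturn, which tells type checkers that the function always raises and never returns, so code after a call to it is treated as unreachable. A minimal, self-contained sketch of the same pattern (the names below are illustrative, not DataChain's actual API):

from typing import NoReturn


class ServerError(Exception):
    """Hypothetical exception type, used only for this illustration."""


def fail(message: str) -> NoReturn:
    # Always raises; type checkers treat any code after a call as unreachable.
    raise ServerError(f"Error from server: {message}")


def handle(ok: bool, message: str) -> str:
    if not ok:
        fail(message)
    # A checker such as mypy knows this point is reached only when ok is True.
    return "success"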
@@ -130,7 +126,6 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-        remote_config: dict[str, Any],
         dataset_name: str,
         dataset_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
@@ -144,10 +139,7 @@
         self.dataset_version = dataset_version
         self.schema = schema
         self.last_status_check: Optional[float] = None
-
-        self.studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+        self.studio_client = StudioClient()
 
     def done_task(self, done):
         for task in done:
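Throughout this diff, StudioClient is now constructed with no arguments; the url, username, and token that used to be passed in are gone, which suggests the client resolves its Studio connection settings internally (for example from saved configuration or the environment) rather than receiving them from each caller. A rough sketch of that constructor pattern, with made-up names rather than the real StudioClient internals:

import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class StudioSettings:
    url: str
    token: str


def load_studio_settings() -> StudioSettings:
    # Illustrative only: resolve settings from the environment with defaults.
    return StudioSettings(
        url=os.environ.get("STUDIO_URL", "https://studio.example.com"),
        token=os.environ.get("STUDIO_TOKEN", ""),
    )


class StudioClientSketch:
    def __init__(self, settings: Optional[StudioSettings] = None) -> None:
        # Callers can simply write StudioClientSketch(); settings are resolved here.
        self.settings = settings or load_studio_settings()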
@@ -181,14 +173,14 @@
             self.dataset_name, self.dataset_version
         )
         if not export_status_response.ok:
-
+            raise_remote_error(export_status_response.message)
 
         export_status = export_status_response.data["status"]  # type: ignore [index]
 
         if export_status == "failed":
-
+            raise_remote_error("Dataset export failed in Studio")
         if export_status == "removed":
-
+            raise_remote_error("Dataset export removed in Studio")
 
         self.last_status_check = time.time()
 
@@ -483,17 +475,12 @@ def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
         if not node_group.sources:
             continue
         listing: Listing = node_group.listing
-
-        if not node_group.is_dataset:
-            assert listing.storage
-            data_source = listing.storage.to_dict(source_path)
-        else:
-            data_source = {"uri": listing.metastore.uri}
-
-        metafile_group = {"data-source": data_source, "files": []}
+        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
         for node in node_group.instantiated_nodes:
             if not node.n.is_dir:
-                metafile_group["files"].append(
+                metafile_group["files"].append(  # type: ignore [attr-defined]
+                    node.get_metafile_data()
+                )
         if metafile_group["files"]:
             metafile_data.append(metafile_group)
 
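After this change each metafile group records only the listing URI as its data source, followed by per-file entries gathered from the instantiated nodes. A rough sketch of the resulting shape (the URI and per-file fields are made up for illustration; the actual keys come from node.get_metafile_data()):

# Hypothetical example of the structure compute_metafile_data() returns.
metafile_data = [
    {
        "data-source": {"uri": "s3://example-bucket/images/"},
        "files": [
            {"name": "cats/cat1.jpg", "size": 12345},
            {"name": "dogs/dog1.jpg", "size": 67890},
        ],
    },
]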
@@ -569,6 +556,12 @@ class Catalog:
 
         return self._warehouse
 
+    @cached_property
+    def session(self):
+        from datachain.query.session import Session
+
+        return Session.get(catalog=self)
+
     def get_init_params(self) -> dict[str, Any]:
         return {
             **self._init_params,
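The new session property gives each Catalog a lazily created session that is built on first access and then reused, so the call sites further down no longer construct their own Session. A minimal sketch of the same lazy-caching pattern with functools.cached_property (the classes here are illustrative stand-ins, not DataChain's actual Session or Catalog):

from functools import cached_property


class Session:
    def __init__(self, owner: object) -> None:
        self.owner = owner


class Catalog:
    @cached_property
    def session(self) -> Session:
        # Created once on first access, then cached for the Catalog's lifetime.
        return Session(self)


catalog = Catalog()
assert catalog.session is catalog.session  # the same cached instance is reused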
@@ -599,162 +592,29 @@ class Catalog:
     def enlist_source(
         self,
         source: str,
-
-        force_update=False,
-        skip_indexing=False,
+        update=False,
         client_config=None,
+        object_name="file",
+        skip_indexing=False,
     ) -> tuple[Listing, str]:
-
-            raise ValueError(
-                "Both force_update and skip_indexing flags"
-                " cannot be True at the same time"
-            )
-
-        partial_id: Optional[int]
-        partial_path: Optional[str]
+        from datachain.lib.dc import DataChain
 
-
-
-        client = Client.get_client(source, self.cache, **client_config)
-        stem = os.path.basename(os.path.normpath(path))
-        prefix = (
-            posixpath.dirname(path)
-            if glob.has_magic(stem) or client.fs.isfile(source)
-            else path
+        DataChain.from_storage(
+            source, session=self.session, update=update, object_name=object_name
         )
-        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
-        source_metastore = self.metastore.clone(uri)
-
-        columns = [
-            Column("path", String),
-            Column("etag", String),
-            Column("version", String),
-            Column("is_latest", Boolean),
-            Column("last_modified", DateTime(timezone=True)),
-            Column("size", Int64),
-            Column("location", JSON),
-            Column("source", String),
-        ]
-
-        if skip_indexing:
-            source_metastore.create_storage_if_not_registered(uri)
-            storage = source_metastore.get_storage(uri)
-            source_metastore.init_partial_id(uri)
-            partial_id = source_metastore.get_next_partial_id(uri)
-
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-            source_metastore.init(uri)
-
-            source_warehouse = self.warehouse.clone()
-            dataset = self.create_dataset(
-                storage_dataset_name, columns=columns, listing=True
-            )
-
-            return (
-                Listing(storage, source_metastore, source_warehouse, client, dataset),
-                path,
-            )
-
-        (
-            storage,
-            need_index,
-            in_progress,
-            partial_id,
-            partial_path,
-        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
-        if in_progress:
-            raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
-
-        if not need_index:
-            assert partial_id is not None
-            assert partial_path is not None
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-            source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-            lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
-            logger.debug(
-                "Using cached listing %s. Valid till: %s",
-                storage.uri,
-                storage.expires_to_local,
-            )
-            # Listing has to have correct version of data storage
-            # initialized with correct Storage
-
-            self.update_dataset_version_with_warehouse_info(
-                dataset,
-                dataset.latest_version,
-            )
-
-            return lst, path
 
-
-
-
-            source_metastore.init(uri)
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-
-            source_warehouse = self.warehouse.clone()
-
-            dataset = self.create_dataset(
-                storage_dataset_name, columns=columns, listing=True
+        list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
+            source, self.session, update=update
         )
 
-            lst = Listing(
-
-
-
-
-
-                storage.uri,
-                StorageStatus.PARTIAL if prefix else StorageStatus.COMPLETE,
-                ttl,
-                prefix=prefix,
-                partial_id=partial_id,
-                dataset=dataset,
-            )
-
-            self.update_dataset_version_with_warehouse_info(
-                dataset,
-                dataset.latest_version,
-            )
-
-        except ClientError as e:
-            # for handling cloud errors
-            error_message = INDEX_INTERNAL_ERROR_MESSAGE
-            if e.error_code in ["InvalidAccessKeyId", "SignatureDoesNotMatch"]:
-                error_message = "Invalid cloud credentials"
-
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.FAILED,
-                ttl,
-                prefix=prefix,
-                error_message=error_message,
-                error_stack=traceback.format_exc(),
-                dataset=dataset,
-            )
-            self._remove_dataset_rows_and_warehouse_info(
-                dataset, dataset.latest_version
-            )
-            raise
-        except:
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.FAILED,
-                ttl,
-                prefix=prefix,
-                error_message=INDEX_INTERNAL_ERROR_MESSAGE,
-                error_stack=traceback.format_exc(),
-                dataset=dataset,
-            )
-            self._remove_dataset_rows_and_warehouse_info(
-                dataset, dataset.latest_version
-            )
-            raise
-
-        lst.storage = storage
+        lst = Listing(
+            self.warehouse.clone(),
+            Client.get_client(list_uri, self.cache, **self.client_config),
+            self.get_dataset(list_ds_name),
+            object_name=object_name,
+        )
 
-        return lst,
+        return lst, list_path
 
     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
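The rewritten enlist_source no longer manages storage records, partial indexing state, or TTLs itself; it delegates to the DataChain API: DataChain.from_storage() creates or refreshes the listing dataset for the source, and DataChain.parse_uri() resolves the listing dataset name, storage URI, and in-storage path from which the Listing is built. A rough usage sketch of that flow (the bucket URI is made up and the session setup is an assumption; the two calls mirror the ones in the code above):

from datachain.lib.dc import DataChain
from datachain.query.session import Session

session = Session.get()  # assumption: obtain a default session, as Catalog.session does
source = "s3://example-bucket/photos/"  # made-up source URI

# List (or, with update=True, re-list) the source into a listing dataset.
DataChain.from_storage(source, session=session, update=False, object_name="file")

# Resolve the listing dataset name, the storage URI, and the path inside it.
list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(source, session)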
@@ -770,7 +630,6 @@ class Catalog:
     def enlist_sources(
         self,
         sources: list[str],
-        ttl: int,
         update: bool,
         skip_indexing=False,
         client_config=None,
@@ -780,10 +639,9 @@
         for src in sources:  # Opt: parallel
             listing, file_path = self.enlist_source(
                 src,
-                ttl,
                 update,
-                skip_indexing=skip_indexing,
                 client_config=client_config or self.client_config,
+                skip_indexing=skip_indexing,
             )
             enlisted_sources.append((listing, file_path))
 
@@ -802,7 +660,6 @@
     def enlist_sources_grouped(
         self,
         sources: list[str],
-        ttl: int,
         update: bool,
         no_glob: bool = False,
         client_config=None,
@@ -823,7 +680,6 @@
             for ds in edatachain_data:
                 listing, source_path = self.enlist_source(
                     ds["data-source"]["uri"],
-                    ttl,
                     update,
                     client_config=client_config,
                 )
@@ -843,11 +699,13 @@
                 )
                 indexed_sources = []
                 for source in dataset_sources:
+                    from datachain.lib.dc import DataChain
+
                     client = self.get_client(source, **client_config)
                     uri = client.uri
-                    ms = self.metastore.clone(uri, None)
                     st = self.warehouse.clone()
-
+                    dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
+                    listing = Listing(st, client, self.get_dataset(dataset_name))
                     rows = DatasetQuery(
                         name=dataset.name, version=ds_version, catalog=self
                     ).to_db_records()
@@ -864,7 +722,7 @@
                 enlisted_sources.append((False, True, indexed_sources))
             else:
                 listing, source_path = self.enlist_source(
-                    src,
+                    src, update, client_config=client_config
                 )
                 enlisted_sources.append((False, False, (listing, source_path)))
 
@@ -1115,19 +973,16 @@
             raise ValueError("Sources needs to be non empty list")
 
         from datachain.lib.dc import DataChain
-        from datachain.query.session import Session
-
-        session = Session.get(catalog=self, client_config=client_config)
 
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
                 dc = DataChain.from_dataset(
-                    source[len(DATASET_PREFIX) :], session=session
+                    source[len(DATASET_PREFIX) :], session=self.session
                 )
             else:
                 dc = DataChain.from_storage(
-                    source, session=session, recursive=recursive
+                    source, session=self.session, recursive=recursive
                 )
 
             chains.append(dc)
@@ -1239,17 +1094,12 @@
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)
 
-    def get_remote_dataset(self, name: str
-
-            read_config(DataChainDir.find().root), remote=""
-        )
-        studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+    def get_remote_dataset(self, name: str) -> DatasetRecord:
+        studio_client = StudioClient()
 
         info_response = studio_client.dataset_info(name)
         if not info_response.ok:
-
+            raise_remote_error(info_response.message)
 
         dataset_info = info_response.data
         assert isinstance(dataset_info, dict)
@@ -1306,6 +1156,20 @@
             for v in d.versions
         )
 
+    def listings(self):
+        """
+        Returns list of ListingInfo objects which are representing specific
+        storage listing datasets
+        """
+        from datachain.lib.listing import is_listing_dataset
+        from datachain.lib.listing_info import ListingInfo
+
+        return [
+            ListingInfo.from_models(d, v, j)
+            for d, v, j in self.list_datasets_versions(include_listing=True)
+            if is_listing_dataset(d.name)
+        ]
+
     def ls_dataset_rows(
         self, name: str, version: int, offset=None, limit=None
     ) -> list[dict]:
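The new listings() method filters the catalog's dataset versions down to the internal storage-listing datasets and wraps each one in a ListingInfo. A rough usage sketch, assuming catalog is an existing Catalog instance (ListingInfo's attributes are not spelled out here):

# Hypothetical inspection of the cached storage listings known to a catalog.
for listing_info in catalog.listings():
    print(listing_info)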
@@ -1430,7 +1294,6 @@
         self,
         sources: list[str],
         fields: Iterable[str],
-        ttl=TTL_INT,
         update=False,
         skip_indexing=False,
         *,
@@ -1438,7 +1301,6 @@
     ) -> Iterator[tuple[DataSource, Iterable[tuple]]]:
         data_sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             skip_indexing=skip_indexing,
             client_config=client_config or self.client_config,
@@ -1457,7 +1319,6 @@
         edatachain_file: Optional[str] = None,
         *,
         client_config=None,
-        remote_config=None,
     ) -> None:
         # TODO add progress bar https://github.com/iterative/dvcx/issues/750
         # TODO copy correct remote dates https://github.com/iterative/dvcx/issues/new
@@ -1479,13 +1340,8 @@
             raise ValueError("Please provide output directory for instantiation")
 
         client_config = client_config or self.client_config
-        remote_config = remote_config or get_remote_config(
-            read_config(DataChainDir.find().root), remote=""
-        )
 
-        studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+        studio_client = StudioClient()
 
         try:
             remote_dataset_name, version = parse_dataset_uri(dataset_uri)
@@ -1499,9 +1355,7 @@
             # we will create new one if it doesn't exist
             pass
 
-        remote_dataset = self.get_remote_dataset(
-            remote_dataset_name, remote_config=remote_config
-        )
+        remote_dataset = self.get_remote_dataset(remote_dataset_name)
         # if version is not specified in uri, take the latest one
         if not version:
             version = remote_dataset.latest_version
@@ -1526,7 +1380,7 @@
 
         stats_response = studio_client.dataset_stats(remote_dataset_name, version)
         if not stats_response.ok:
-
+            raise_remote_error(stats_response.message)
         dataset_stats = stats_response.data
 
         dataset_save_progress_bar = tqdm(
@@ -1558,7 +1412,7 @@
             remote_dataset_name, version
         )
         if not export_response.ok:
-
+            raise_remote_error(export_response.message)
 
         signed_urls = export_response.data
 
@@ -1572,7 +1426,6 @@
             rows_fetcher = DatasetRowsFetcher(
                 metastore,
                 warehouse,
-                remote_config,
                 dataset.name,
                 version,
                 schema,
@@ -1615,7 +1468,6 @@
         no_cp: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
-        ttl: int = TTL_INT,
         *,
         client_config=None,
     ) -> None:
@@ -1637,7 +1489,6 @@
                 edatachain_only=no_cp,
                 no_edatachain_file=not edatachain,
                 edatachain_file=edatachain_file,
-                ttl=ttl,
                 client_config=client_config,
             )
         else:
@@ -1645,7 +1496,6 @@
             # it needs to be done here
             self.enlist_sources(
                 sources,
-                ttl,
                 update,
                 client_config=client_config or self.client_config,
             )
@@ -1705,7 +1555,6 @@
         edatachain_only: bool = False,
         no_edatachain_file: bool = False,
         no_glob: bool = False,
-        ttl: int = TTL_INT,
         *,
         client_config=None,
     ) -> list[dict[str, Any]]:
@@ -1717,7 +1566,6 @@
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
             sources,
-            ttl,
             update,
             no_glob,
             client_config=client_config,
@@ -1776,14 +1624,12 @@
         self,
         sources,
         depth=0,
-        ttl=TTL_INT,
        update=False,
         *,
         client_config=None,
     ) -> Iterable[tuple[str, float]]:
         sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             client_config=client_config or self.client_config,
         )
@@ -1804,7 +1650,6 @@
     def find(
         self,
         sources,
-        ttl=TTL_INT,
         update=False,
         names=None,
         inames=None,
@@ -1818,7 +1663,6 @@
     ) -> Iterator[str]:
         sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             client_config=client_config or self.client_config,
         )
@@ -1854,7 +1698,6 @@
     def index(
         self,
         sources,
-        ttl=TTL_INT,
         update=False,
         *,
         client_config=None,
@@ -1880,7 +1723,6 @@
 
         self.enlist_sources(
             non_root_sources,
-            ttl,
             update,
             client_config=client_config,
             only_index=True,