datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +2 -0
- datachain/catalog/catalog.py +62 -228
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +12 -4
- datachain/data_storage/warehouse.py +101 -129
- datachain/lib/convert/sql_to_python.py +8 -12
- datachain/lib/dc.py +275 -80
- datachain/lib/func/__init__.py +32 -0
- datachain/lib/func/aggregate.py +353 -0
- datachain/lib/func/func.py +152 -0
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +17 -8
- datachain/lib/udf.py +3 -3
- datachain/lib/utils.py +5 -0
- datachain/listing.py +22 -48
- datachain/query/__init__.py +1 -2
- datachain/query/batch.py +0 -1
- datachain/query/dataset.py +33 -46
- datachain/query/schema.py +1 -61
- datachain/query/session.py +33 -25
- datachain/remote/studio.py +63 -14
- datachain/sql/functions/__init__.py +1 -1
- datachain/sql/functions/aggregate.py +47 -0
- datachain/sql/functions/array.py +0 -8
- datachain/sql/sqlite/base.py +20 -2
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED

```diff
@@ -1,3 +1,4 @@
+from datachain.lib import func
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import C, Column, DataChain, Sys
 from datachain.lib.file import (
@@ -34,6 +35,7 @@ __all__ = [
     "Sys",
     "TarVFile",
     "TextFile",
+    "func",
     "is_chain_type",
     "metrics",
     "param",
```
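The new top-level `func` export corresponds to the aggregate helpers added under `datachain/lib/func/` in this release (see the file list above). A minimal, hedged sketch of how it might be used; `group_by` and the exact aggregate signatures are assumptions based on the module layout, not something this diff shows:

```python
# Illustrative sketch only: assumes datachain.func exposes aggregate helpers
# such as count()/sum() and that DataChain.group_by() accepts them.
from datachain import DataChain, func

chain = DataChain.from_storage("s3://my-bucket/images/")  # bucket name is illustrative
grouped = chain.group_by(
    total=func.count(),          # assumed aggregate: rows per group
    size=func.sum("file.size"),  # assumed aggregate: total byte size per group
    partition_by="file.source",
)
```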
datachain/catalog/catalog.py
CHANGED

```diff
@@ -1,4 +1,3 @@
-import glob
 import io
 import json
 import logging
@@ -35,7 +34,6 @@ from tqdm import tqdm
 
 from datachain.cache import DataChainCache
 from datachain.client import Client
-from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
     DATASET_PREFIX,
     QUERY_DATASET_PREFIX,
@@ -48,12 +46,10 @@ from datachain.dataset import (
     parse_dataset_uri,
 )
 from datachain.error import (
-    ClientError,
     DataChainError,
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
-    PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptRunError,
 )
@@ -61,8 +57,8 @@ from datachain.listing import Listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
-from datachain.sql.types import
-from datachain.storage import
+from datachain.sql.types import DateTime, SQLType, String
+from datachain.storage import StorageURI
 from datachain.utils import (
     DataChainDir,
     batched,
@@ -102,7 +98,7 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be av
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
 
 
-def
+def raise_remote_error(error_message: str) -> NoReturn:
     raise DataChainError(f"Error from server: {error_message}")
 
 
@@ -130,7 +126,6 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-        remote_config: dict[str, Any],
         dataset_name: str,
         dataset_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
@@ -144,10 +139,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.dataset_version = dataset_version
         self.schema = schema
         self.last_status_check: Optional[float] = None
-
-        self.studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+        self.studio_client = StudioClient()
 
     def done_task(self, done):
         for task in done:
@@ -181,14 +173,14 @@ class DatasetRowsFetcher(NodesThreadPool):
             self.dataset_name, self.dataset_version
         )
         if not export_status_response.ok:
-
+            raise_remote_error(export_status_response.message)
 
         export_status = export_status_response.data["status"]  # type: ignore [index]
 
         if export_status == "failed":
-
+            raise_remote_error("Dataset export failed in Studio")
         if export_status == "removed":
-
+            raise_remote_error("Dataset export removed in Studio")
 
         self.last_status_check = time.time()
 
```
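The hunks above drop the explicit `remote_config` plumbing: `StudioClient` is now constructed with no arguments, which suggests it resolves the Studio URL and token itself (the expanded `datachain/config.py` and new `datachain/studio.py` in the file list point the same way). A brief before/after sketch, grounded in the removed and added lines:

```python
from datachain.remote.studio import StudioClient

# Before (as removed above): callers assembled the remote config and passed it in.
# remote_config = get_remote_config(read_config(DataChainDir.find().root), remote="")
# client = StudioClient(remote_config["url"], remote_config["username"], remote_config["token"])

# After (as added above): no-argument construction; the client is assumed to read
# its connection settings from the DataChain config on its own.
client = StudioClient()
```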
```diff
@@ -483,17 +475,12 @@ def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
         if not node_group.sources:
             continue
         listing: Listing = node_group.listing
-
-        if not node_group.is_dataset:
-            assert listing.storage
-            data_source = listing.storage.to_dict(source_path)
-        else:
-            data_source = {"uri": listing.metastore.uri}
-
-        metafile_group = {"data-source": data_source, "files": []}
+        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
         for node in node_group.instantiated_nodes:
             if not node.n.is_dir:
-                metafile_group["files"].append(
+                metafile_group["files"].append(  # type: ignore [attr-defined]
+                    node.get_metafile_data()
+                )
         if metafile_group["files"]:
             metafile_data.append(metafile_group)
 
@@ -569,6 +556,12 @@ class Catalog:
 
         return self._warehouse
 
+    @cached_property
+    def session(self):
+        from datachain.query.session import Session
+
+        return Session.get(catalog=self)
+
     def get_init_params(self) -> dict[str, Any]:
         return {
             **self._init_params,
```
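The new `Catalog.session` property memoizes one `Session` per catalog instance, and the hunks that follow swap ad-hoc `Session.get(...)` calls for this shared property. A small standalone illustration of the same `functools.cached_property` pattern (the class below is a simplified stand-in, not the real datachain `Catalog`):

```python
from functools import cached_property


class Catalog:  # simplified stand-in for illustration
    @cached_property
    def session(self):
        # Evaluated on first access, then stored on the instance,
        # so every later access returns the same object.
        print("creating session")
        return object()


c = Catalog()
assert c.session is c.session  # "creating session" is printed only once
```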
```diff
@@ -599,162 +592,29 @@ class Catalog:
     def enlist_source(
         self,
         source: str,
-
-        force_update=False,
-        skip_indexing=False,
+        update=False,
         client_config=None,
+        object_name="file",
+        skip_indexing=False,
     ) -> tuple[Listing, str]:
-
-            raise ValueError(
-                "Both force_update and skip_indexing flags"
-                " cannot be True at the same time"
-            )
-
-        partial_id: Optional[int]
-        partial_path: Optional[str]
+        from datachain.lib.dc import DataChain
 
-
-
-        client = Client.get_client(source, self.cache, **client_config)
-        stem = os.path.basename(os.path.normpath(path))
-        prefix = (
-            posixpath.dirname(path)
-            if glob.has_magic(stem) or client.fs.isfile(source)
-            else path
+        DataChain.from_storage(
+            source, session=self.session, update=update, object_name=object_name
         )
-        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
-        source_metastore = self.metastore.clone(uri)
-
-        columns = [
-            Column("path", String),
-            Column("etag", String),
-            Column("version", String),
-            Column("is_latest", Boolean),
-            Column("last_modified", DateTime(timezone=True)),
-            Column("size", Int64),
-            Column("location", JSON),
-            Column("source", String),
-        ]
-
-        if skip_indexing:
-            source_metastore.create_storage_if_not_registered(uri)
-            storage = source_metastore.get_storage(uri)
-            source_metastore.init_partial_id(uri)
-            partial_id = source_metastore.get_next_partial_id(uri)
-
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-            source_metastore.init(uri)
-
-            source_warehouse = self.warehouse.clone()
-            dataset = self.create_dataset(
-                storage_dataset_name, columns=columns, listing=True
-            )
-
-            return (
-                Listing(storage, source_metastore, source_warehouse, client, dataset),
-                path,
-            )
-
-        (
-            storage,
-            need_index,
-            in_progress,
-            partial_id,
-            partial_path,
-        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
-        if in_progress:
-            raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
-
-        if not need_index:
-            assert partial_id is not None
-            assert partial_path is not None
-            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
-            source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-            lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
-            logger.debug(
-                "Using cached listing %s. Valid till: %s",
-                storage.uri,
-                storage.expires_to_local,
-            )
-            # Listing has to have correct version of data storage
-            # initialized with correct Storage
-
-            self.update_dataset_version_with_warehouse_info(
-                dataset,
-                dataset.latest_version,
-            )
-
-            return lst, path
-
-        source_metastore.init_partial_id(uri)
-        partial_id = source_metastore.get_next_partial_id(uri)
-
-        source_metastore.init(uri)
-        source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
 
-
-
-        dataset = self.create_dataset(
-            storage_dataset_name, columns=columns, listing=True
+        list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
+            source, self.session, update=update
         )
 
-        lst = Listing(
-
-
-
-
-
-            storage.uri,
-            StorageStatus.PARTIAL if prefix else StorageStatus.COMPLETE,
-            ttl,
-            prefix=prefix,
-            partial_id=partial_id,
-            dataset=dataset,
-        )
-
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            dataset.latest_version,
-        )
-
-        except ClientError as e:
-            # for handling cloud errors
-            error_message = INDEX_INTERNAL_ERROR_MESSAGE
-            if e.error_code in ["InvalidAccessKeyId", "SignatureDoesNotMatch"]:
-                error_message = "Invalid cloud credentials"
-
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.FAILED,
-                ttl,
-                prefix=prefix,
-                error_message=error_message,
-                error_stack=traceback.format_exc(),
-                dataset=dataset,
-            )
-            self._remove_dataset_rows_and_warehouse_info(
-                dataset, dataset.latest_version
-            )
-            raise
-        except:
-            source_metastore.mark_storage_indexed(
-                storage.uri,
-                StorageStatus.FAILED,
-                ttl,
-                prefix=prefix,
-                error_message=INDEX_INTERNAL_ERROR_MESSAGE,
-                error_stack=traceback.format_exc(),
-                dataset=dataset,
-            )
-            self._remove_dataset_rows_and_warehouse_info(
-                dataset, dataset.latest_version
-            )
-            raise
-
-        lst.storage = storage
+        lst = Listing(
+            self.warehouse.clone(),
+            Client.get_client(list_uri, self.cache, **self.client_config),
+            self.get_dataset(list_ds_name),
+            object_name=object_name,
+        )
 
-        return lst,
+        return lst, list_path
 
     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
```
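With the `Storage`/indexing machinery removed, `enlist_source` now delegates listing to `DataChain.from_storage` and wraps the resulting listing dataset in a lightweight `Listing`. A hedged sketch of calling the new signature; `get_catalog` is an assumed way to obtain a `Catalog`, and the URI is illustrative:

```python
from datachain.catalog import get_catalog  # assumed helper for obtaining a Catalog

catalog = get_catalog()

# New keyword arguments per the hunk above: update / object_name / skip_indexing,
# with the old ttl and force_update parameters gone.
listing, rel_path = catalog.enlist_source(
    "s3://my-bucket/images/",  # illustrative URI
    update=False,
    object_name="file",
)
print(rel_path)
```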
```diff
@@ -770,7 +630,6 @@ class Catalog:
     def enlist_sources(
         self,
         sources: list[str],
-        ttl: int,
         update: bool,
         skip_indexing=False,
         client_config=None,
@@ -780,10 +639,9 @@
         for src in sources:  # Opt: parallel
             listing, file_path = self.enlist_source(
                 src,
-                ttl,
                 update,
-                skip_indexing=skip_indexing,
                 client_config=client_config or self.client_config,
+                skip_indexing=skip_indexing,
             )
             enlisted_sources.append((listing, file_path))
 
@@ -802,7 +660,6 @@ class Catalog:
     def enlist_sources_grouped(
         self,
         sources: list[str],
-        ttl: int,
         update: bool,
         no_glob: bool = False,
         client_config=None,
@@ -823,7 +680,6 @@ class Catalog:
             for ds in edatachain_data:
                 listing, source_path = self.enlist_source(
                     ds["data-source"]["uri"],
-                    ttl,
                     update,
                     client_config=client_config,
                 )
@@ -843,11 +699,13 @@ class Catalog:
             )
             indexed_sources = []
             for source in dataset_sources:
+                from datachain.lib.dc import DataChain
+
                 client = self.get_client(source, **client_config)
                 uri = client.uri
-                ms = self.metastore.clone(uri, None)
                 st = self.warehouse.clone()
-
+                dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
+                listing = Listing(st, client, self.get_dataset(dataset_name))
                 rows = DatasetQuery(
                     name=dataset.name, version=ds_version, catalog=self
                 ).to_db_records()
@@ -864,7 +722,7 @@ class Catalog:
             enlisted_sources.append((False, True, indexed_sources))
         else:
             listing, source_path = self.enlist_source(
-                src,
+                src, update, client_config=client_config
             )
             enlisted_sources.append((False, False, (listing, source_path)))
 
@@ -989,13 +847,6 @@ class Catalog:
             c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
         }
 
-        job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
-        if not job_id:
-            from datachain.query.session import Session
-
-            session = Session.get(catalog=self)
-            job_id = session.job_id
-
         dataset = self.metastore.create_dataset_version(
             dataset,
             version,
@@ -1122,19 +973,16 @@ class Catalog:
             raise ValueError("Sources needs to be non empty list")
 
         from datachain.lib.dc import DataChain
-        from datachain.query.session import Session
-
-        session = Session.get(catalog=self, client_config=client_config)
 
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
                 dc = DataChain.from_dataset(
-                    source[len(DATASET_PREFIX) :], session=session
+                    source[len(DATASET_PREFIX) :], session=self.session
                 )
             else:
                 dc = DataChain.from_storage(
-                    source, session=session, recursive=recursive
+                    source, session=self.session, recursive=recursive
                 )
 
             chains.append(dc)
@@ -1218,6 +1066,7 @@ class Catalog:
             preview=dataset_version.preview,
             job_id=dataset_version.job_id,
         )
+
         # to avoid re-creating rows table, we are just renaming it for a new version
         # of target dataset
         self.warehouse.rename_dataset_table(
@@ -1245,17 +1094,12 @@ class Catalog:
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)
 
-    def get_remote_dataset(self, name: str
-
-            read_config(DataChainDir.find().root), remote=""
-        )
-        studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+    def get_remote_dataset(self, name: str) -> DatasetRecord:
+        studio_client = StudioClient()
 
         info_response = studio_client.dataset_info(name)
         if not info_response.ok:
-
+            raise_remote_error(info_response.message)
 
         dataset_info = info_response.data
         assert isinstance(dataset_info, dict)
@@ -1312,6 +1156,20 @@ class Catalog:
             for v in d.versions
         )
 
+    def listings(self):
+        """
+        Returns list of ListingInfo objects which are representing specific
+        storage listing datasets
+        """
+        from datachain.lib.listing import is_listing_dataset
+        from datachain.lib.listing_info import ListingInfo
+
+        return [
+            ListingInfo.from_models(d, v, j)
+            for d, v, j in self.list_datasets_versions(include_listing=True)
+            if is_listing_dataset(d.name)
+        ]
+
     def ls_dataset_rows(
         self, name: str, version: int, offset=None, limit=None
     ) -> list[dict]:
```
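The new `Catalog.listings()` helper exposes storage-listing datasets as `ListingInfo` objects (backed by the new `datachain/lib/listing_info.py` above). A brief, hedged usage sketch; the printed attribute is an assumption about the `ListingInfo` model:

```python
from datachain.catalog import get_catalog  # assumed helper for obtaining a Catalog

catalog = get_catalog()
for info in catalog.listings():
    # ListingInfo is assumed to carry at least the listing dataset's name.
    print(info.name)
```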
```diff
@@ -1325,8 +1183,6 @@ class Catalog:
         if offset:
             q = q.offset(offset)
 
-        q = q.order_by("sys__id")
-
         return q.to_db_records()
 
     def signed_url(self, source: str, path: str, client_config=None) -> str:
@@ -1438,7 +1294,6 @@ class Catalog:
         self,
         sources: list[str],
         fields: Iterable[str],
-        ttl=TTL_INT,
         update=False,
         skip_indexing=False,
         *,
@@ -1446,7 +1301,6 @@ class Catalog:
     ) -> Iterator[tuple[DataSource, Iterable[tuple]]]:
         data_sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             skip_indexing=skip_indexing,
             client_config=client_config or self.client_config,
@@ -1465,7 +1319,6 @@ class Catalog:
         edatachain_file: Optional[str] = None,
         *,
         client_config=None,
-        remote_config=None,
     ) -> None:
         # TODO add progress bar https://github.com/iterative/dvcx/issues/750
         # TODO copy correct remote dates https://github.com/iterative/dvcx/issues/new
@@ -1487,13 +1340,8 @@ class Catalog:
             raise ValueError("Please provide output directory for instantiation")
 
         client_config = client_config or self.client_config
-        remote_config = remote_config or get_remote_config(
-            read_config(DataChainDir.find().root), remote=""
-        )
 
-        studio_client = StudioClient(
-            remote_config["url"], remote_config["username"], remote_config["token"]
-        )
+        studio_client = StudioClient()
 
         try:
             remote_dataset_name, version = parse_dataset_uri(dataset_uri)
@@ -1507,9 +1355,7 @@ class Catalog:
             # we will create new one if it doesn't exist
             pass
 
-        remote_dataset = self.get_remote_dataset(
-            remote_dataset_name, remote_config=remote_config
-        )
+        remote_dataset = self.get_remote_dataset(remote_dataset_name)
         # if version is not specified in uri, take the latest one
         if not version:
             version = remote_dataset.latest_version
@@ -1534,7 +1380,7 @@ class Catalog:
 
         stats_response = studio_client.dataset_stats(remote_dataset_name, version)
         if not stats_response.ok:
-
+            raise_remote_error(stats_response.message)
         dataset_stats = stats_response.data
 
         dataset_save_progress_bar = tqdm(
@@ -1566,7 +1412,7 @@ class Catalog:
             remote_dataset_name, version
         )
         if not export_response.ok:
-
+            raise_remote_error(export_response.message)
 
         signed_urls = export_response.data
 
@@ -1580,7 +1426,6 @@ class Catalog:
         rows_fetcher = DatasetRowsFetcher(
             metastore,
             warehouse,
-            remote_config,
             dataset.name,
             version,
             schema,
@@ -1623,7 +1468,6 @@ class Catalog:
         no_cp: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
-        ttl: int = TTL_INT,
         *,
         client_config=None,
     ) -> None:
@@ -1645,7 +1489,6 @@ class Catalog:
                 edatachain_only=no_cp,
                 no_edatachain_file=not edatachain,
                 edatachain_file=edatachain_file,
-                ttl=ttl,
                 client_config=client_config,
             )
         else:
@@ -1653,7 +1496,6 @@ class Catalog:
             # it needs to be done here
             self.enlist_sources(
                 sources,
-                ttl,
                 update,
                 client_config=client_config or self.client_config,
             )
@@ -1713,7 +1555,6 @@ class Catalog:
         edatachain_only: bool = False,
         no_edatachain_file: bool = False,
         no_glob: bool = False,
-        ttl: int = TTL_INT,
         *,
         client_config=None,
     ) -> list[dict[str, Any]]:
@@ -1725,7 +1566,6 @@ class Catalog:
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
             sources,
-            ttl,
             update,
             no_glob,
             client_config=client_config,
@@ -1784,14 +1624,12 @@ class Catalog:
         self,
         sources,
         depth=0,
-        ttl=TTL_INT,
         update=False,
         *,
         client_config=None,
     ) -> Iterable[tuple[str, float]]:
         sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             client_config=client_config or self.client_config,
         )
@@ -1812,7 +1650,6 @@ class Catalog:
     def find(
         self,
         sources,
-        ttl=TTL_INT,
         update=False,
         names=None,
         inames=None,
@@ -1826,7 +1663,6 @@ class Catalog:
     ) -> Iterator[str]:
         sources = self.enlist_sources(
             sources,
-            ttl,
             update,
             client_config=client_config or self.client_config,
         )
@@ -1862,7 +1698,6 @@ class Catalog:
     def index(
         self,
         sources,
-        ttl=TTL_INT,
         update=False,
         *,
         client_config=None,
@@ -1888,7 +1723,6 @@ class Catalog:
 
         self.enlist_sources(
             non_root_sources,
-            ttl,
             update,
             client_config=client_config,
             only_index=True,
```