datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.


Files changed (38)
  1. datachain/__init__.py +2 -0
  2. datachain/catalog/catalog.py +62 -228
  3. datachain/cli.py +136 -22
  4. datachain/client/fsspec.py +9 -0
  5. datachain/client/local.py +11 -32
  6. datachain/config.py +126 -51
  7. datachain/data_storage/schema.py +66 -33
  8. datachain/data_storage/sqlite.py +12 -4
  9. datachain/data_storage/warehouse.py +101 -129
  10. datachain/lib/convert/sql_to_python.py +8 -12
  11. datachain/lib/dc.py +275 -80
  12. datachain/lib/func/__init__.py +32 -0
  13. datachain/lib/func/aggregate.py +353 -0
  14. datachain/lib/func/func.py +152 -0
  15. datachain/lib/listing.py +6 -21
  16. datachain/lib/listing_info.py +4 -0
  17. datachain/lib/signal_schema.py +17 -8
  18. datachain/lib/udf.py +3 -3
  19. datachain/lib/utils.py +5 -0
  20. datachain/listing.py +22 -48
  21. datachain/query/__init__.py +1 -2
  22. datachain/query/batch.py +0 -1
  23. datachain/query/dataset.py +33 -46
  24. datachain/query/schema.py +1 -61
  25. datachain/query/session.py +33 -25
  26. datachain/remote/studio.py +63 -14
  27. datachain/sql/functions/__init__.py +1 -1
  28. datachain/sql/functions/aggregate.py +47 -0
  29. datachain/sql/functions/array.py +0 -8
  30. datachain/sql/sqlite/base.py +20 -2
  31. datachain/studio.py +129 -0
  32. datachain/utils.py +58 -0
  33. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
  34. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
  35. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
  36. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
  37. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
  38. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from datachain.lib import func
  from datachain.lib.data_model import DataModel, DataType, is_chain_type
  from datachain.lib.dc import C, Column, DataChain, Sys
  from datachain.lib.file import (
@@ -34,6 +35,7 @@ __all__ = [
  "Sys",
  "TarVFile",
  "TextFile",
+ "func",
  "is_chain_type",
  "metrics",
  "param",
datachain/catalog/catalog.py CHANGED
@@ -1,4 +1,3 @@
- import glob
  import io
  import json
  import logging
@@ -35,7 +34,6 @@ from tqdm import tqdm

  from datachain.cache import DataChainCache
  from datachain.client import Client
- from datachain.config import get_remote_config, read_config
  from datachain.dataset import (
  DATASET_PREFIX,
  QUERY_DATASET_PREFIX,
@@ -48,12 +46,10 @@ from datachain.dataset import (
  parse_dataset_uri,
  )
  from datachain.error import (
- ClientError,
  DataChainError,
  DatasetInvalidVersionError,
  DatasetNotFoundError,
  DatasetVersionNotFoundError,
- PendingIndexingError,
  QueryScriptCancelError,
  QueryScriptRunError,
  )
@@ -61,8 +57,8 @@ from datachain.listing import Listing
  from datachain.node import DirType, Node, NodeWithPath
  from datachain.nodes_thread_pool import NodesThreadPool
  from datachain.remote.studio import StudioClient
- from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
- from datachain.storage import Storage, StorageStatus, StorageURI
+ from datachain.sql.types import DateTime, SQLType, String
+ from datachain.storage import StorageURI
  from datachain.utils import (
  DataChainDir,
  batched,
@@ -102,7 +98,7 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be av
  PULL_DATASET_CHECK_STATUS_INTERVAL = 20 # interval to check export status in Studio


- def _raise_remote_error(error_message: str) -> NoReturn:
+ def raise_remote_error(error_message: str) -> NoReturn:
  raise DataChainError(f"Error from server: {error_message}")


@@ -130,7 +126,6 @@ class DatasetRowsFetcher(NodesThreadPool):
  self,
  metastore: "AbstractMetastore",
  warehouse: "AbstractWarehouse",
- remote_config: dict[str, Any],
  dataset_name: str,
  dataset_version: int,
  schema: dict[str, Union[SQLType, type[SQLType]]],
@@ -144,10 +139,7 @@ class DatasetRowsFetcher(NodesThreadPool):
  self.dataset_version = dataset_version
  self.schema = schema
  self.last_status_check: Optional[float] = None
-
- self.studio_client = StudioClient(
- remote_config["url"], remote_config["username"], remote_config["token"]
- )
+ self.studio_client = StudioClient()

  def done_task(self, done):
  for task in done:
@@ -181,14 +173,14 @@ class DatasetRowsFetcher(NodesThreadPool):
  self.dataset_name, self.dataset_version
  )
  if not export_status_response.ok:
- _raise_remote_error(export_status_response.message)
+ raise_remote_error(export_status_response.message)

  export_status = export_status_response.data["status"] # type: ignore [index]

  if export_status == "failed":
- _raise_remote_error("Dataset export failed in Studio")
+ raise_remote_error("Dataset export failed in Studio")
  if export_status == "removed":
- _raise_remote_error("Dataset export removed in Studio")
+ raise_remote_error("Dataset export removed in Studio")

  self.last_status_check = time.time()

@@ -483,17 +475,12 @@ def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
  if not node_group.sources:
  continue
  listing: Listing = node_group.listing
- source_path: str = node_group.source_path
- if not node_group.is_dataset:
- assert listing.storage
- data_source = listing.storage.to_dict(source_path)
- else:
- data_source = {"uri": listing.metastore.uri}
-
- metafile_group = {"data-source": data_source, "files": []}
+ metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
  for node in node_group.instantiated_nodes:
  if not node.n.is_dir:
- metafile_group["files"].append(node.get_metafile_data())
+ metafile_group["files"].append( # type: ignore [attr-defined]
+ node.get_metafile_data()
+ )
  if metafile_group["files"]:
  metafile_data.append(metafile_group)

@@ -569,6 +556,12 @@ class Catalog:

  return self._warehouse

+ @cached_property
+ def session(self):
+ from datachain.query.session import Session
+
+ return Session.get(catalog=self)
+
  def get_init_params(self) -> dict[str, Any]:
  return {
  **self._init_params,
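
The new cached `session` property lets the catalog reuse one lazily created Session across the DataChain calls introduced further down, instead of each call site building its own. A rough illustration of the behavior; the `catalog` variable is a placeholder for any Catalog instance:

s1 = catalog.session   # first access creates the Session via Session.get(catalog=catalog)
s2 = catalog.session   # cached_property returns the same object on later accesses
assert s1 is s2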
@@ -599,162 +592,29 @@ class Catalog:
  def enlist_source(
  self,
  source: str,
- ttl: int,
- force_update=False,
- skip_indexing=False,
+ update=False,
  client_config=None,
+ object_name="file",
+ skip_indexing=False,
  ) -> tuple[Listing, str]:
- if force_update and skip_indexing:
- raise ValueError(
- "Both force_update and skip_indexing flags"
- " cannot be True at the same time"
- )
-
- partial_id: Optional[int]
- partial_path: Optional[str]
+ from datachain.lib.dc import DataChain

- client_config = client_config or self.client_config
- uri, path = Client.parse_url(source)
- client = Client.get_client(source, self.cache, **client_config)
- stem = os.path.basename(os.path.normpath(path))
- prefix = (
- posixpath.dirname(path)
- if glob.has_magic(stem) or client.fs.isfile(source)
- else path
+ DataChain.from_storage(
+ source, session=self.session, update=update, object_name=object_name
  )
- storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
- source_metastore = self.metastore.clone(uri)
-
- columns = [
- Column("path", String),
- Column("etag", String),
- Column("version", String),
- Column("is_latest", Boolean),
- Column("last_modified", DateTime(timezone=True)),
- Column("size", Int64),
- Column("location", JSON),
- Column("source", String),
- ]
-
- if skip_indexing:
- source_metastore.create_storage_if_not_registered(uri)
- storage = source_metastore.get_storage(uri)
- source_metastore.init_partial_id(uri)
- partial_id = source_metastore.get_next_partial_id(uri)
-
- source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
- source_metastore.init(uri)
-
- source_warehouse = self.warehouse.clone()
- dataset = self.create_dataset(
- storage_dataset_name, columns=columns, listing=True
- )
-
- return (
- Listing(storage, source_metastore, source_warehouse, client, dataset),
- path,
- )
-
- (
- storage,
- need_index,
- in_progress,
- partial_id,
- partial_path,
- ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
- if in_progress:
- raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")
-
- if not need_index:
- assert partial_id is not None
- assert partial_path is not None
- source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
- source_warehouse = self.warehouse.clone()
- dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
- lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
- logger.debug(
- "Using cached listing %s. Valid till: %s",
- storage.uri,
- storage.expires_to_local,
- )
- # Listing has to have correct version of data storage
- # initialized with correct Storage
-
- self.update_dataset_version_with_warehouse_info(
- dataset,
- dataset.latest_version,
- )
-
- return lst, path
-
- source_metastore.init_partial_id(uri)
- partial_id = source_metastore.get_next_partial_id(uri)
-
- source_metastore.init(uri)
- source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)

- source_warehouse = self.warehouse.clone()
-
- dataset = self.create_dataset(
- storage_dataset_name, columns=columns, listing=True
+ list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
+ source, self.session, update=update
  )

- lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
-
- try:
- lst.fetch(prefix)
-
- source_metastore.mark_storage_indexed(
- storage.uri,
- StorageStatus.PARTIAL if prefix else StorageStatus.COMPLETE,
- ttl,
- prefix=prefix,
- partial_id=partial_id,
- dataset=dataset,
- )
-
- self.update_dataset_version_with_warehouse_info(
- dataset,
- dataset.latest_version,
- )
-
- except ClientError as e:
- # for handling cloud errors
- error_message = INDEX_INTERNAL_ERROR_MESSAGE
- if e.error_code in ["InvalidAccessKeyId", "SignatureDoesNotMatch"]:
- error_message = "Invalid cloud credentials"
-
- source_metastore.mark_storage_indexed(
- storage.uri,
- StorageStatus.FAILED,
- ttl,
- prefix=prefix,
- error_message=error_message,
- error_stack=traceback.format_exc(),
- dataset=dataset,
- )
- self._remove_dataset_rows_and_warehouse_info(
- dataset, dataset.latest_version
- )
- raise
- except:
- source_metastore.mark_storage_indexed(
- storage.uri,
- StorageStatus.FAILED,
- ttl,
- prefix=prefix,
- error_message=INDEX_INTERNAL_ERROR_MESSAGE,
- error_stack=traceback.format_exc(),
- dataset=dataset,
- )
- self._remove_dataset_rows_and_warehouse_info(
- dataset, dataset.latest_version
- )
- raise
-
- lst.storage = storage
+ lst = Listing(
+ self.warehouse.clone(),
+ Client.get_client(list_uri, self.cache, **self.client_config),
+ self.get_dataset(list_ds_name),
+ object_name=object_name,
+ )

- return lst, path
+ return lst, list_path

  def _remove_dataset_rows_and_warehouse_info(
  self, dataset: DatasetRecord, version: int, **kwargs
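
The rewritten enlist_source delegates indexing to the DataChain listing pipeline: from_storage materializes (or reuses) the listing dataset, parse_uri resolves the dataset name, storage URI and in-bucket path, and the slimmed-down Listing simply wraps that dataset. Condensed into straight-line form using only the calls visible in this hunk, with `catalog` standing in for self:

DataChain.from_storage(source, session=catalog.session, update=update, object_name="file")
list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(source, catalog.session, update=update)
lst = Listing(
    catalog.warehouse.clone(),
    Client.get_client(list_uri, catalog.cache, **catalog.client_config),
    catalog.get_dataset(list_ds_name),
    object_name="file",
)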
@@ -770,7 +630,6 @@ class Catalog:
  def enlist_sources(
  self,
  sources: list[str],
- ttl: int,
  update: bool,
  skip_indexing=False,
  client_config=None,
@@ -780,10 +639,9 @@ class Catalog:
  for src in sources: # Opt: parallel
  listing, file_path = self.enlist_source(
  src,
- ttl,
  update,
- skip_indexing=skip_indexing,
  client_config=client_config or self.client_config,
+ skip_indexing=skip_indexing,
  )
  enlisted_sources.append((listing, file_path))

@@ -802,7 +660,6 @@ class Catalog:
  def enlist_sources_grouped(
  self,
  sources: list[str],
- ttl: int,
  update: bool,
  no_glob: bool = False,
  client_config=None,
@@ -823,7 +680,6 @@ class Catalog:
  for ds in edatachain_data:
  listing, source_path = self.enlist_source(
  ds["data-source"]["uri"],
- ttl,
  update,
  client_config=client_config,
  )
@@ -843,11 +699,13 @@ class Catalog:
  )
  indexed_sources = []
  for source in dataset_sources:
+ from datachain.lib.dc import DataChain
+
  client = self.get_client(source, **client_config)
  uri = client.uri
- ms = self.metastore.clone(uri, None)
  st = self.warehouse.clone()
- listing = Listing(None, ms, st, client, None)
+ dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
+ listing = Listing(st, client, self.get_dataset(dataset_name))
  rows = DatasetQuery(
  name=dataset.name, version=ds_version, catalog=self
  ).to_db_records()
@@ -864,7 +722,7 @@ class Catalog:
  enlisted_sources.append((False, True, indexed_sources))
  else:
  listing, source_path = self.enlist_source(
- src, ttl, update, client_config=client_config
+ src, update, client_config=client_config
  )
  enlisted_sources.append((False, False, (listing, source_path)))

@@ -989,13 +847,6 @@ class Catalog:
  c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
  }

- job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
- if not job_id:
- from datachain.query.session import Session
-
- session = Session.get(catalog=self)
- job_id = session.job_id
-
  dataset = self.metastore.create_dataset_version(
  dataset,
  version,
@@ -1122,19 +973,16 @@ class Catalog:
  raise ValueError("Sources needs to be non empty list")

  from datachain.lib.dc import DataChain
- from datachain.query.session import Session
-
- session = Session.get(catalog=self, client_config=client_config)

  chains = []
  for source in sources:
  if source.startswith(DATASET_PREFIX):
  dc = DataChain.from_dataset(
- source[len(DATASET_PREFIX) :], session=session
+ source[len(DATASET_PREFIX) :], session=self.session
  )
  else:
  dc = DataChain.from_storage(
- source, session=session, recursive=recursive
+ source, session=self.session, recursive=recursive
  )

  chains.append(dc)
@@ -1218,6 +1066,7 @@ class Catalog:
  preview=dataset_version.preview,
  job_id=dataset_version.job_id,
  )
+
  # to avoid re-creating rows table, we are just renaming it for a new version
  # of target dataset
  self.warehouse.rename_dataset_table(
@@ -1245,17 +1094,12 @@ class Catalog:
  def get_dataset(self, name: str) -> DatasetRecord:
  return self.metastore.get_dataset(name)

- def get_remote_dataset(self, name: str, *, remote_config=None) -> DatasetRecord:
- remote_config = remote_config or get_remote_config(
- read_config(DataChainDir.find().root), remote=""
- )
- studio_client = StudioClient(
- remote_config["url"], remote_config["username"], remote_config["token"]
- )
+ def get_remote_dataset(self, name: str) -> DatasetRecord:
+ studio_client = StudioClient()

  info_response = studio_client.dataset_info(name)
  if not info_response.ok:
- _raise_remote_error(info_response.message)
+ raise_remote_error(info_response.message)

  dataset_info = info_response.data
  assert isinstance(dataset_info, dict)
@@ -1312,6 +1156,20 @@ class Catalog:
  for v in d.versions
  )

+ def listings(self):
+ """
+ Returns list of ListingInfo objects which are representing specific
+ storage listing datasets
+ """
+ from datachain.lib.listing import is_listing_dataset
+ from datachain.lib.listing_info import ListingInfo
+
+ return [
+ ListingInfo.from_models(d, v, j)
+ for d, v, j in self.list_datasets_versions(include_listing=True)
+ if is_listing_dataset(d.name)
+ ]
+
  def ls_dataset_rows(
  self, name: str, version: int, offset=None, limit=None
  ) -> list[dict]:
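
The new Catalog.listings() gives programmatic access to the per-storage listing datasets created by from_storage, filtering list_datasets_versions() through is_listing_dataset. A small usage sketch; printing is illustrative only and `catalog` is a placeholder Catalog instance:

for info in catalog.listings():
    # each item is a ListingInfo built from a listing dataset, its version and job
    print(info)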
@@ -1325,8 +1183,6 @@ class Catalog:
  if offset:
  q = q.offset(offset)

- q = q.order_by("sys__id")
-
  return q.to_db_records()

  def signed_url(self, source: str, path: str, client_config=None) -> str:
@@ -1438,7 +1294,6 @@ class Catalog:
  self,
  sources: list[str],
  fields: Iterable[str],
- ttl=TTL_INT,
  update=False,
  skip_indexing=False,
  *,
@@ -1446,7 +1301,6 @@ class Catalog:
  ) -> Iterator[tuple[DataSource, Iterable[tuple]]]:
  data_sources = self.enlist_sources(
  sources,
- ttl,
  update,
  skip_indexing=skip_indexing,
  client_config=client_config or self.client_config,
@@ -1465,7 +1319,6 @@ class Catalog:
  edatachain_file: Optional[str] = None,
  *,
  client_config=None,
- remote_config=None,
  ) -> None:
  # TODO add progress bar https://github.com/iterative/dvcx/issues/750
  # TODO copy correct remote dates https://github.com/iterative/dvcx/issues/new
@@ -1487,13 +1340,8 @@ class Catalog:
  raise ValueError("Please provide output directory for instantiation")

  client_config = client_config or self.client_config
- remote_config = remote_config or get_remote_config(
- read_config(DataChainDir.find().root), remote=""
- )

- studio_client = StudioClient(
- remote_config["url"], remote_config["username"], remote_config["token"]
- )
+ studio_client = StudioClient()

  try:
  remote_dataset_name, version = parse_dataset_uri(dataset_uri)
@@ -1507,9 +1355,7 @@ class Catalog:
  # we will create new one if it doesn't exist
  pass

- remote_dataset = self.get_remote_dataset(
- remote_dataset_name, remote_config=remote_config
- )
+ remote_dataset = self.get_remote_dataset(remote_dataset_name)
  # if version is not specified in uri, take the latest one
  if not version:
  version = remote_dataset.latest_version
@@ -1534,7 +1380,7 @@ class Catalog:

  stats_response = studio_client.dataset_stats(remote_dataset_name, version)
  if not stats_response.ok:
- _raise_remote_error(stats_response.message)
+ raise_remote_error(stats_response.message)
  dataset_stats = stats_response.data

  dataset_save_progress_bar = tqdm(
@@ -1566,7 +1412,7 @@ class Catalog:
  remote_dataset_name, version
  )
  if not export_response.ok:
- _raise_remote_error(export_response.message)
+ raise_remote_error(export_response.message)

  signed_urls = export_response.data

@@ -1580,7 +1426,6 @@ class Catalog:
  rows_fetcher = DatasetRowsFetcher(
  metastore,
  warehouse,
- remote_config,
  dataset.name,
  version,
  schema,
@@ -1623,7 +1468,6 @@ class Catalog:
  no_cp: bool = False,
  edatachain: bool = False,
  edatachain_file: Optional[str] = None,
- ttl: int = TTL_INT,
  *,
  client_config=None,
  ) -> None:
@@ -1645,7 +1489,6 @@ class Catalog:
  edatachain_only=no_cp,
  no_edatachain_file=not edatachain,
  edatachain_file=edatachain_file,
- ttl=ttl,
  client_config=client_config,
  )
  else:
@@ -1653,7 +1496,6 @@ class Catalog:
  # it needs to be done here
  self.enlist_sources(
  sources,
- ttl,
  update,
  client_config=client_config or self.client_config,
  )
@@ -1713,7 +1555,6 @@ class Catalog:
  edatachain_only: bool = False,
  no_edatachain_file: bool = False,
  no_glob: bool = False,
- ttl: int = TTL_INT,
  *,
  client_config=None,
  ) -> list[dict[str, Any]]:
@@ -1725,7 +1566,6 @@ class Catalog:
  client_config = client_config or self.client_config
  node_groups = self.enlist_sources_grouped(
  sources,
- ttl,
  update,
  no_glob,
  client_config=client_config,
@@ -1784,14 +1624,12 @@ class Catalog:
  self,
  sources,
  depth=0,
- ttl=TTL_INT,
  update=False,
  *,
  client_config=None,
  ) -> Iterable[tuple[str, float]]:
  sources = self.enlist_sources(
  sources,
- ttl,
  update,
  client_config=client_config or self.client_config,
  )
@@ -1812,7 +1650,6 @@ class Catalog:
  def find(
  self,
  sources,
- ttl=TTL_INT,
  update=False,
  names=None,
  inames=None,
@@ -1826,7 +1663,6 @@ class Catalog:
  ) -> Iterator[str]:
  sources = self.enlist_sources(
  sources,
- ttl,
  update,
  client_config=client_config or self.client_config,
  )
@@ -1862,7 +1698,6 @@ class Catalog:
  def index(
  self,
  sources,
- ttl=TTL_INT,
  update=False,
  *,
  client_config=None,
@@ -1888,7 +1723,6 @@ class Catalog:

  self.enlist_sources(
  non_root_sources,
- ttl,
  update,
  client_config=client_config,
  only_index=True,