datachain 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +0 -2
- datachain/catalog/catalog.py +12 -9
- datachain/cli.py +109 -9
- datachain/client/fsspec.py +9 -9
- datachain/data_storage/metastore.py +63 -11
- datachain/data_storage/schema.py +2 -2
- datachain/data_storage/sqlite.py +5 -4
- datachain/data_storage/warehouse.py +18 -18
- datachain/dataset.py +142 -14
- datachain/func/__init__.py +49 -0
- datachain/{lib/func → func}/aggregate.py +13 -11
- datachain/func/array.py +176 -0
- datachain/func/base.py +23 -0
- datachain/func/conditional.py +81 -0
- datachain/func/func.py +384 -0
- datachain/func/path.py +110 -0
- datachain/func/random.py +23 -0
- datachain/func/string.py +154 -0
- datachain/func/window.py +49 -0
- datachain/lib/arrow.py +24 -12
- datachain/lib/data_model.py +25 -9
- datachain/lib/dataset_info.py +9 -5
- datachain/lib/dc.py +94 -56
- datachain/lib/hf.py +1 -1
- datachain/lib/signal_schema.py +1 -1
- datachain/lib/utils.py +1 -0
- datachain/lib/webdataset_laion.py +5 -5
- datachain/model/bbox.py +2 -2
- datachain/model/pose.py +5 -5
- datachain/model/segment.py +2 -2
- datachain/nodes_fetcher.py +2 -2
- datachain/query/dataset.py +57 -34
- datachain/remote/studio.py +40 -8
- datachain/sql/__init__.py +0 -2
- datachain/sql/functions/__init__.py +0 -26
- datachain/sql/selectable.py +11 -5
- datachain/sql/sqlite/base.py +11 -2
- datachain/studio.py +29 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/METADATA +2 -2
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/RECORD +44 -37
- datachain/lib/func/__init__.py +0 -32
- datachain/lib/func/func.py +0 -152
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/LICENSE +0 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/WHEEL +0 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-from datachain.lib import func
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import C, Column, DataChain, Sys
 from datachain.lib.file import (
@@ -35,7 +34,6 @@ __all__ = [
     "Sys",
     "TarVFile",
     "TextFile",
-    "func",
     "is_chain_type",
     "metrics",
     "param",
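Together with the deletion of datachain/lib/func/ and the new top-level datachain/func/ package in the file list above, this hunk moves the function helpers from datachain.lib.func to datachain.func. A minimal usage sketch of the new import path; the exported names are an assumption based on the new module files (func/string.py, func/random.py):

# Sketch only: assumes datachain.func exposes helpers matching the new
# module files listed in this release.
from datachain.func import random, string  # was: from datachain.lib import func

name_length = string.length("name")  # assumed string helper
shuffle_key = random.rand()          # assumed random helper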
datachain/catalog/catalog.py
CHANGED
@@ -38,6 +38,7 @@ from datachain.dataset import (
     DATASET_PREFIX,
     QUERY_DATASET_PREFIX,
     DatasetDependency,
+    DatasetListRecord,
     DatasetRecord,
     DatasetStats,
     DatasetStatus,
@@ -54,7 +55,6 @@ from datachain.error import (
     QueryScriptCancelError,
     QueryScriptRunError,
 )
-from datachain.listing import Listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
@@ -73,9 +73,10 @@ if TYPE_CHECKING:
         AbstractMetastore,
         AbstractWarehouse,
     )
-    from datachain.dataset import
+    from datachain.dataset import DatasetListVersion
     from datachain.job import Job
     from datachain.lib.file import File
+    from datachain.listing import Listing
 
 logger = logging.getLogger("datachain")
 
@@ -236,7 +237,7 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""
 
-    listing: Listing
+    listing: "Listing"
     sources: list[DataSource]
 
     # The source path within the bucket
@@ -591,8 +592,9 @@ class Catalog:
         client_config=None,
         object_name="file",
         skip_indexing=False,
-    ) -> tuple[Listing, str]:
+    ) -> tuple["Listing", str]:
         from datachain.lib.dc import DataChain
+        from datachain.listing import Listing
 
         DataChain.from_storage(
             source, session=self.session, update=update, object_name=object_name
@@ -660,7 +662,8 @@ class Catalog:
         no_glob: bool = False,
         client_config=None,
     ) -> list[NodeGroup]:
-        from datachain.
+        from datachain.listing import Listing
+        from datachain.query.dataset import DatasetQuery
 
         def _row_to_node(d: dict[str, Any]) -> Node:
             del d["file__source"]
@@ -876,7 +879,7 @@ class Catalog:
     def update_dataset_version_with_warehouse_info(
         self, dataset: DatasetRecord, version: int, rows_dropped=False, **kwargs
     ) -> None:
-        from datachain.query import DatasetQuery
+        from datachain.query.dataset import DatasetQuery
 
         dataset_version = dataset.get_version(version)
 
@@ -1133,7 +1136,7 @@ class Catalog:
 
         return direct_dependencies
 
-    def ls_datasets(self, include_listing: bool = False) -> Iterator[
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
             if not d.is_bucket_listing or include_listing:
@@ -1142,7 +1145,7 @@ class Catalog:
     def list_datasets_versions(
         self,
         include_listing: bool = False,
-    ) -> Iterator[tuple[
+    ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
         datasets = list(self.ls_datasets(include_listing=include_listing))
 
@@ -1177,7 +1180,7 @@ class Catalog:
     def ls_dataset_rows(
         self, name: str, version: int, offset=None, limit=None
     ) -> list[dict]:
-        from datachain.query import DatasetQuery
+        from datachain.query.dataset import DatasetQuery
 
         dataset = self.get_dataset(name)
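Most hunks in this file apply one refactor: Listing and DatasetQuery are no longer imported at module scope; they move under TYPE_CHECKING or into function bodies, and the annotations that name them are quoted. A minimal sketch of the pattern, with a hypothetical function, assuming the intent is to defer import-cycle-prone modules to call time:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never imported at runtime.
    from datachain.listing import Listing

def enlist_source(source: str) -> "Listing":  # quoted forward reference
    # The runtime import is deferred until the function is called,
    # after any circular module initialization has finished.
    from datachain.listing import Listing

    return Listing.from_source(source)  # hypothetical constructor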
datachain/cli.py
CHANGED
@@ -18,7 +18,12 @@ from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyVa
 from datachain.config import Config
 from datachain.error import DataChainError
 from datachain.lib.dc import DataChain
-from datachain.studio import
+from datachain.studio import (
+    edit_studio_dataset,
+    list_datasets,
+    process_studio_cli_args,
+    remove_studio_dataset,
+)
 from datachain.telemetry import telemetry
 
 if TYPE_CHECKING:
@@ -403,21 +408,44 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_edit_dataset.add_argument(
         "--new-name",
         action="store",
-        default="",
         help="Dataset new name",
     )
     parse_edit_dataset.add_argument(
         "--description",
         action="store",
-        default="",
         help="Dataset description",
     )
     parse_edit_dataset.add_argument(
         "--labels",
-        default=[],
         nargs="+",
         help="Dataset labels",
     )
+    parse_edit_dataset.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="Edit dataset from Studio",
+    )
+    parse_edit_dataset.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Edit local dataset only",
+    )
+    parse_edit_dataset.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Edit both datasets from studio and local",
+    )
+    parse_edit_dataset.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to edit a dataset. By default, it will use team from config.",
+    )
 
     datasets_parser = subp.add_parser(
         "datasets", parents=[parent_parser], description="List datasets"
@@ -466,6 +494,32 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action=BooleanOptionalAction,
         help="Force delete registered dataset with all of it's versions",
     )
+    rm_dataset_parser.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="Remove dataset from Studio",
+    )
+    rm_dataset_parser.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Remove local datasets only",
+    )
+    rm_dataset_parser.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Remove both local and studio",
+    )
+    rm_dataset_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to delete a dataset. By default, it will use team from config.",
+    )
 
     dataset_stats_parser = subp.add_parser(
         "dataset-stats",
@@ -909,8 +963,40 @@ def rm_dataset(
     name: str,
     version: Optional[int] = None,
     force: Optional[bool] = False,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
+):
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)
+
+    if all or local:
+        catalog.remove_dataset(name, version=version, force=force)
+
+    if (all or studio) and token:
+        remove_studio_dataset(team, name, version, force)
+
+
+def edit_dataset(
+    catalog: "Catalog",
+    name: str,
+    new_name: Optional[str] = None,
+    description: Optional[str] = None,
+    labels: Optional[list[str]] = None,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
 ):
-
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)
+
+    if all or local:
+        catalog.edit_dataset(name, new_name, description, labels)
+
+    if (all or studio) and token:
+        edit_studio_dataset(team, name, new_name, description, labels)
 
 
 def dataset_stats(
@@ -957,7 +1043,7 @@ def show(
     schema: bool = False,
 ) -> None:
     from datachain.lib.dc import DataChain
-    from datachain.query import DatasetQuery
+    from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records
 
     dataset = catalog.get_dataset(name)
@@ -1127,11 +1213,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             edatachain_file=args.edatachain_file,
         )
     elif args.command == "edit-dataset":
-
+        edit_dataset(
+            catalog,
             args.name,
-            description=args.description,
             new_name=args.new_name,
+            description=args.description,
             labels=args.labels,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
         )
     elif args.command == "ls":
         ls(
@@ -1164,7 +1255,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             schema=args.schema,
         )
     elif args.command == "rm-dataset":
-        rm_dataset(
+        rm_dataset(
+            catalog,
+            args.name,
+            version=args.version,
+            force=args.force,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        )
     elif args.command == "dataset-stats":
         dataset_stats(
             catalog,
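Both rewritten handlers share one dispatch shape: resolve the --studio/--local/--all flags against the configured Studio token, act locally, then act on Studio. The _determine_flavors helper they call is not part of this diff; a plausible sketch of what the call sites imply, offered as an assumption rather than the shipped implementation:

from typing import Optional

def _determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
    # Assumed behavior, inferred from the call sites above: an explicit
    # --studio or --local turns off the implicit --all default, and Studio
    # is only attempted when a token is configured (the callers re-check
    # `token` before any remote call).
    if studio or local:
        all = False
    studio = studio and bool(token)
    return all, local, studio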
datachain/client/fsspec.py
CHANGED
@@ -28,7 +28,6 @@ from tqdm import tqdm
 from datachain.cache import DataChainCache
 from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
-from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
 from datachain.nodes_thread_pool import NodeChunk
 
@@ -36,6 +35,7 @@ if TYPE_CHECKING:
     from fsspec.spec import AbstractFileSystem
 
     from datachain.dataset import StorageURI
+    from datachain.lib.file import File
 
 
 logger = logging.getLogger("datachain")
@@ -45,7 +45,7 @@ DELIMITER = "/"  # Path delimiter.
 
 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
 
-ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
+ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
 
 
 def _is_win_local_path(uri: str) -> bool:
@@ -212,7 +212,7 @@ class Client(ABC):
 
     async def scandir(
         self, start_prefix: str, method: str = "default"
-    ) -> AsyncIterator[Sequence[File]]:
+    ) -> AsyncIterator[Sequence["File"]]:
         try:
             impl = getattr(self, f"_fetch_{method}")
         except AttributeError:
@@ -317,7 +317,7 @@ class Client(ABC):
         return f"{self.PREFIX}{self.name}/{rel_path}"
 
     @abstractmethod
-    def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
+    def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...
 
     def fetch_nodes(
         self,
@@ -354,7 +354,7 @@ class Client(ABC):
         copy2(src, dst)
 
     def open_object(
-        self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
+        self, file: "File", use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
         if use_cache and (cache_path := self.cache.get_path(file)):
@@ -362,19 +362,19 @@ class Client(ABC):
         assert not file.location
         return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
 
-    def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
+    def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
         sync(get_loop(), functools.partial(self._download, file, callback=callback))
 
-    async def _download(self, file: File, *, callback: "Callback" = None) -> None:
+    async def _download(self, file: "File", *, callback: "Callback" = None) -> None:
         if self.cache.contains(file):
             # Already in cache, so there's nothing to do.
             return
         await self._put_in_cache(file, callback=callback)
 
-    def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+    def put_in_cache(self, file: "File", *, callback: "Callback" = None) -> None:
         sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
 
-    async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+    async def _put_in_cache(self, file: "File", *, callback: "Callback" = None) -> None:
         assert not file.location
         if file.etag:
             etag = await self.get_current_etag(file)
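Every change in this file serves one refactor: File becomes a type-checking-only import, so each annotation that names it is quoted, including inside the ResultQueue alias. A short self-contained sketch of why the quoting works at runtime:

import asyncio
from collections.abc import Sequence
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    from datachain.lib.file import File  # not imported at runtime

# "File" is stored as an unevaluated forward reference, so this alias can be
# built even though File is not a runtime name in the module (Python >= 3.9,
# where asyncio.Queue supports subscription).
ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]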
datachain/data_storage/metastore.py
CHANGED
@@ -27,6 +27,8 @@ from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import (
     DatasetDependency,
+    DatasetListRecord,
+    DatasetListVersion,
     DatasetRecord,
     DatasetStatus,
     DatasetVersion,
@@ -59,6 +61,8 @@ class AbstractMetastore(ABC, Serializable):
 
     schema: "schema.Schema"
     dataset_class: type[DatasetRecord] = DatasetRecord
+    dataset_list_class: type[DatasetListRecord] = DatasetListRecord
+    dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
 
@@ -166,11 +170,11 @@ class AbstractMetastore(ABC, Serializable):
     """
 
     @abstractmethod
-    def list_datasets(self) -> Iterator[
+    def list_datasets(self) -> Iterator[DatasetListRecord]:
         """Lists all datasets."""
 
     @abstractmethod
-    def list_datasets_by_prefix(self, prefix: str) -> Iterator["
+    def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
         """Lists all datasets which names start with prefix."""
 
     @abstractmethod
@@ -348,6 +352,14 @@ class AbstractDBMetastore(AbstractMetastore):
             if c.name  # type: ignore [attr-defined]
         ]
 
+    @cached_property
+    def _dataset_list_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._datasets_columns()
+            if c.name in self.dataset_list_class.__dataclass_fields__  # type: ignore [attr-defined]
+        ]
+
     @classmethod
     def _datasets_versions_columns(cls) -> list["SchemaItem"]:
         """Datasets versions table columns."""
@@ -390,6 +402,15 @@ class AbstractDBMetastore(AbstractMetastore):
             if c.name  # type: ignore [attr-defined]
         ]
 
+    @cached_property
+    def _dataset_list_version_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._datasets_versions_columns()
+            if c.name  # type: ignore [attr-defined]
+            in self.dataset_list_version_class.__dataclass_fields__
+        ]
+
     @classmethod
     def _datasets_dependencies_columns(cls) -> list["SchemaItem"]:
         """Datasets dependencies table columns."""
@@ -671,7 +692,25 @@ class AbstractDBMetastore(AbstractMetastore):
             if dataset:
                 yield dataset
 
-    def
+    def _parse_list_dataset(self, rows) -> Optional[DatasetListRecord]:
+        versions = [self.dataset_list_class.parse(*r) for r in rows]
+        if not versions:
+            return None
+        return reduce(lambda ds, version: ds.merge_versions(version), versions)
+
+    def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
+        # grouping rows by dataset id
+        for _, g in groupby(rows, lambda r: r[0]):
+            dataset = self._parse_list_dataset(list(g))
+            if dataset:
+                yield dataset
+
+    def _get_dataset_query(
+        self,
+        dataset_fields: list[str],
+        dataset_version_fields: list[str],
+        isouter: bool = True,
+    ):
         if not (
             self.db.has_table(self._datasets.name)
             and self.db.has_table(self._datasets_versions.name)
@@ -680,23 +719,36 @@ class AbstractDBMetastore(AbstractMetastore):
 
         d = self._datasets
         dv = self._datasets_versions
+
         query = self._datasets_select(
-            *(getattr(d.c, f) for f in
-            *(getattr(dv.c, f) for f in
+            *(getattr(d.c, f) for f in dataset_fields),
+            *(getattr(dv.c, f) for f in dataset_version_fields),
         )
-        j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=
+        j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
         return query.select_from(j)
 
-    def
+    def _base_dataset_query(self):
+        return self._get_dataset_query(
+            self._dataset_fields, self._dataset_version_fields
+        )
+
+    def _base_list_datasets_query(self):
+        return self._get_dataset_query(
+            self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
+        )
+
+    def list_datasets(self) -> Iterator["DatasetListRecord"]:
         """Lists all datasets."""
-        yield from self.
+        yield from self._parse_dataset_list(
+            self.db.execute(self._base_list_datasets_query())
+        )
 
     def list_datasets_by_prefix(
         self, prefix: str, conn=None
-    ) -> Iterator["
-        query = self.
+    ) -> Iterator["DatasetListRecord"]:
+        query = self._base_list_datasets_query()
         query = query.where(self._datasets.c.name.startswith(prefix))
-        yield from self.
+        yield from self._parse_dataset_list(self.db.execute(query))
 
     def get_dataset(self, name: str, conn=None) -> DatasetRecord:
         """Gets a single dataset by name"""
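The new _parse_dataset_list depends on rows arriving ordered by dataset id: itertools.groupby splits consecutive rows per dataset, and functools.reduce folds each group's versions into one record via merge_versions. A self-contained sketch of that fold with a simplified stand-in type:

from dataclasses import dataclass, field
from functools import reduce
from itertools import groupby

@dataclass
class DatasetStub:  # simplified stand-in for DatasetListRecord
    id: int
    versions: list[int] = field(default_factory=list)

    def merge_versions(self, other: "DatasetStub") -> "DatasetStub":
        self.versions.extend(other.versions)
        return self

# (dataset_id, version) rows, already ordered by dataset_id, as the inner
# join built by _base_list_datasets_query would return them.
rows = [(1, 1), (1, 2), (2, 1)]
for ds_id, group in groupby(rows, key=lambda r: r[0]):
    per_version = [DatasetStub(r[0], [r[1]]) for r in group]
    merged = reduce(lambda ds, v: ds.merge_versions(v), per_version)
    print(ds_id, merged.versions)  # -> 1 [1, 2], then 2 [1]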
datachain/data_storage/schema.py
CHANGED
@@ -12,7 +12,7 @@ import sqlalchemy as sa
 from sqlalchemy.sql import func as f
 from sqlalchemy.sql.expression import false, null, true
 
-from datachain.sql.functions import path
+from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType, UInt64
 
 if TYPE_CHECKING:
@@ -130,7 +130,7 @@ class DirExpansion:
 
     def query(self, q):
        q = self.base_select(q).cte(recursive=True)
-        parent = path.parent(self.c(q, "path"))
+        parent = pathfunc.parent(self.c(q, "path"))
        q = q.union_all(
            sa.select(
                sa.literal(-1).label("sys__id"),
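The only substantive change here is importing the module under an alias so the bare name path stays available. A small sketch of the shadowing this avoids; the local parameter is hypothetical:

from datachain.sql.functions import path as pathfunc

def parent_expr(column, path: str = ""):  # hypothetical local named `path`
    # With the alias, the parameter no longer shadows the SQL helper module.
    return pathfunc.parent(column)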
datachain/data_storage/sqlite.py
CHANGED
@@ -122,7 +122,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return cls(*cls._connect(db_file=db_file))
 
     @staticmethod
-    def _connect(
+    def _connect(
+        db_file: Optional[str] = None,
+    ) -> tuple["Engine", "MetaData", sqlite3.Connection, str]:
         try:
             if db_file == ":memory:":
                 # Enable multithreaded usage of the same in-memory db
@@ -130,9 +132,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
                     _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
                 )
             else:
-
-
-                )
+                db_file = db_file or DataChainDir.find().db
+                db = sqlite3.connect(db_file, detect_types=DETECT_TYPES)
             create_user_defined_sql_functions(db)
             engine = sqlalchemy.create_engine(
                 "sqlite+pysqlite:///", creator=lambda: db, future=True
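_connect gains an explicit optional db_file parameter and a spelled-out return type: engine, metadata, the raw sqlite3 connection, and (presumably) the resolved database path. A hedged usage sketch mirroring the from_db_file call visible at the top of the hunk:

import sqlite3

from datachain.data_storage.sqlite import SQLiteDatabaseEngine

# Sketch only: exercises the private helper directly for illustration.
# ":memory:" takes the in-memory branch; omitting db_file is assumed to
# fall back to DataChainDir.find().db as shown in the hunk.
engine, metadata, db, db_path = SQLiteDatabaseEngine._connect(db_file=":memory:")
assert isinstance(db, sqlite3.Connection)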
@@ -224,28 +224,28 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
224
224
|
offset = 0
|
|
225
225
|
num_yielded = 0
|
|
226
226
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
if limit
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
227
|
+
# Ensure we're using a thread-local connection
|
|
228
|
+
with self.clone() as wh:
|
|
229
|
+
while True:
|
|
230
|
+
if limit is not None:
|
|
231
|
+
limit -= num_yielded
|
|
232
|
+
if limit == 0:
|
|
233
|
+
break
|
|
234
|
+
if limit < page_size:
|
|
235
|
+
paginated_query = paginated_query.limit(None).limit(limit)
|
|
236
|
+
|
|
237
237
|
# Cursor results are not thread-safe, so we convert them to a list
|
|
238
238
|
results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
|
|
239
239
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
240
|
+
processed = False
|
|
241
|
+
for row in results:
|
|
242
|
+
processed = True
|
|
243
|
+
yield row
|
|
244
|
+
num_yielded += 1
|
|
245
245
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
246
|
+
if not processed:
|
|
247
|
+
break # no more results
|
|
248
|
+
offset += page_size
|
|
249
249
|
|
|
250
250
|
#
|
|
251
251
|
# Table Name Internal Functions
|