datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +2 -0
- datachain/catalog/catalog.py +62 -228
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +12 -4
- datachain/data_storage/warehouse.py +101 -129
- datachain/lib/convert/sql_to_python.py +8 -12
- datachain/lib/dc.py +275 -80
- datachain/lib/func/__init__.py +32 -0
- datachain/lib/func/aggregate.py +353 -0
- datachain/lib/func/func.py +152 -0
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +17 -8
- datachain/lib/udf.py +3 -3
- datachain/lib/utils.py +5 -0
- datachain/listing.py +22 -48
- datachain/query/__init__.py +1 -2
- datachain/query/batch.py +0 -1
- datachain/query/dataset.py +33 -46
- datachain/query/schema.py +1 -61
- datachain/query/session.py +33 -25
- datachain/remote/studio.py +63 -14
- datachain/sql/functions/__init__.py +1 -1
- datachain/sql/functions/aggregate.py +47 -0
- datachain/sql/functions/array.py +0 -8
- datachain/sql/sqlite/base.py +20 -2
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/listing.py
CHANGED
@@ -4,12 +4,10 @@ from collections.abc import Iterable, Iterator
 from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional

-from fsspec.asyn import get_loop, sync
 from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm

-from datachain.lib.file import File
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
@@ -17,33 +15,29 @@ from datachain.utils import suffix_to_number
 if TYPE_CHECKING:
     from datachain.catalog.datasource import DataSource
     from datachain.client import Client
-    from datachain.data_storage import
+    from datachain.data_storage import AbstractWarehouse
     from datachain.dataset import DatasetRecord
-    from datachain.storage import Storage


 class Listing:
     def __init__(
         self,
-        storage: Optional["Storage"],
-        metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         client: "Client",
         dataset: Optional["DatasetRecord"],
+        object_name: str = "file",
     ):
-        self.storage = storage
-        self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
         self.dataset = dataset  # dataset representing bucket listing
+        self.object_name = object_name

     def clone(self) -> "Listing":
         return self.__class__(
-            self.storage,
-            self.metastore.clone(),
             self.warehouse.clone(),
             self.client,
             self.dataset,
+            self.object_name,
         )

     def __enter__(self) -> "Listing":
@@ -53,46 +47,20 @@ class Listing:
         self.close()

     def close(self) -> None:
-        self.metastore.close()
         self.warehouse.close()

     @property
-    def
-
+    def uri(self):
+        from datachain.lib.listing import listing_uri_from_name
+
+        return listing_uri_from_name(self.dataset.name)

     @property
     def dataset_rows(self):
-        return self.warehouse.dataset_rows(
-
-    def fetch(self, start_prefix="", method: str = "default") -> None:
-        sync(get_loop(), self._fetch, start_prefix, method)
-
-    async def _fetch(self, start_prefix: str, method: str) -> None:
-        with self.clone() as fetch_listing:
-            if start_prefix:
-                start_prefix = start_prefix.rstrip("/")
-            try:
-                async for entries in fetch_listing.client.scandir(
-                    start_prefix, method=method
-                ):
-                    fetch_listing.insert_entries(entries)
-                    if len(entries) > 1:
-                        fetch_listing.metastore.update_last_inserted_at()
-            finally:
-                fetch_listing.insert_entries_done()
-
-    def insert_entry(self, entry: File) -> None:
-        self.insert_entries([entry])
-
-    def insert_entries(self, entries: Iterable[File]) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(entries),
+        return self.warehouse.dataset_rows(
+            self.dataset, self.dataset.latest_version, object_name=self.object_name
         )

-    def insert_entries_done(self) -> None:
-        self.warehouse.insert_rows_done(self.dataset_rows.get_table())
-
     def expand_path(self, path, use_glob=True) -> list[Node]:
         if use_glob and glob.has_magic(path):
             return self.warehouse.expand_path(self.dataset_rows, path)
@@ -200,25 +168,31 @@ class Listing:
         conds = []
         if names:
             for name in names:
-                conds.append(
+                conds.append(
+                    pathfunc.name(Column(dr.col_name("path"))).op("GLOB")(name)
+                )
         if inames:
             for iname in inames:
                 conds.append(
-                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(
+                    func.lower(pathfunc.name(Column(dr.col_name("path")))).op("GLOB")(
+                        iname.lower()
+                    )
                 )
         if paths:
             for path in paths:
-                conds.append(Column("path").op("GLOB")(path))
+                conds.append(Column(dr.col_name("path")).op("GLOB")(path))
         if ipaths:
             for ipath in ipaths:
-                conds.append(
+                conds.append(
+                    func.lower(Column(dr.col_name("path"))).op("GLOB")(ipath.lower())
+                )

         if size is not None:
             size_limit = suffix_to_number(size)
             if size_limit >= 0:
-                conds.append(Column("size") >= size_limit)
+                conds.append(Column(dr.col_name("size")) >= size_limit)
             else:
-                conds.append(Column("size") <= -size_limit)
+                conds.append(Column(dr.col_name("size")) <= -size_limit)

         return self.warehouse.find(
             dr,
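In 0.6.2 a Listing no longer carries a metastore or writes entries itself; it only reads an already materialized listing dataset through the warehouse. A minimal sketch of the new constructor surface, assuming the catalog, client, and dataset record are obtained elsewhere (the wiring below is illustrative, not the library's exact bootstrap code):

    from datachain.listing import Listing

    def open_listing(catalog, client, listing_dataset) -> Listing:
        # Hypothetical wiring; parameter names follow the constructor shown above.
        return Listing(
            warehouse=catalog.warehouse,   # AbstractWarehouse implementation
            client=client,                 # Client for the underlying bucket
            dataset=listing_dataset,       # DatasetRecord holding the bucket listing
            object_name="file",            # column prefix used via dr.col_name(...)
        )

With that in place, listing.uri derives the listing URI from the dataset name via listing_uri_from_name, and listing.dataset_rows returns the warehouse table for the dataset's latest version under the given object_name prefix.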
datachain/query/__init__.py
CHANGED
@@ -1,12 +1,11 @@
 from .dataset import DatasetQuery
 from .params import param
-from .schema import C,
+from .schema import C, LocalFilename, Object, Stream
 from .session import Session

 __all__ = [
     "C",
     "DatasetQuery",
-    "DatasetRow",
     "LocalFilename",
     "Object",
     "Session",
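With DatasetRow dropped from the package exports, the names that remain importable from datachain.query (per the updated __all__ above) include:

    from datachain.query import C, DatasetQuery, LocalFilename, Object, Session

Code that previously did `from datachain.query import DatasetRow` can no longer rely on that helper; the class itself is deleted from datachain/query/schema.py further below.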
datachain/query/batch.py
CHANGED
datachain/query/dataset.py
CHANGED
@@ -10,6 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
 from functools import wraps
+from secrets import token_hex
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -173,10 +174,10 @@ class QueryStep(StartingStep):
             return sqlalchemy.select(*columns)

         dataset = self.catalog.get_dataset(self.dataset_name)
-
+        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)

         return step_result(
-            q,
+            q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )
@@ -591,10 +592,6 @@ class UDFSignal(UDFStep):
             return query, []
         table = self.catalog.warehouse.create_pre_udf_table(query)
         q: Select = sqlalchemy.select(*table.c)
-        if query._order_by_clauses:
-            # we are adding ordering only if it's explicitly added by user in
-            # query part before adding signals
-            q = q.order_by(table.c.sys__id)
         return q, [table]

     def create_result_query(
@@ -630,11 +627,6 @@ class UDFSignal(UDFStep):
         else:
             res = sqlalchemy.select(*cols1).select_from(subq)

-        if query._order_by_clauses:
-            # if ordering is used in query part before adding signals, we
-            # will have it as order by id from select from pre-created udf table
-            res = res.order_by(subq.c.sys__id)
-
         if self.partition_by is not None:
             subquery = res.subquery()
             res = sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -666,13 +658,6 @@ class RowGenerator(UDFStep):
     def create_result_query(
         self, udf_table, query: Select
     ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:
-        if not query._order_by_clauses:
-            # if we are not selecting all rows in UDF, we need to ensure that
-            # we get the same rows as we got as inputs of UDF since selecting
-            # without ordering can be non deterministic in some databases
-            c = query.selected_columns
-            query = query.order_by(c.sys__id)
-
         udf_table_query = udf_table.select().subquery()
         udf_table_cols: list[sqlalchemy.Label[Any]] = [
             label(c.name, c) for c in udf_table_query.columns
@@ -736,10 +721,17 @@ class SQLMutate(SQLClause):

     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
+        to_mutate = {c.name for c in self.args}
+
+        prefix = f"mutate{token_hex(8)}_"
+        cols = [
+            c.label(prefix + c.name) if c.name in to_mutate else c
+            for c in original_subquery.c
+        ]
         # this is needed for new column to be used in clauses
         # like ORDER BY, otherwise new column is not recognized
         subquery = (
-            sqlalchemy.select(*
+            sqlalchemy.select(*cols, *self.args)
             .select_from(original_subquery)
             .subquery()
         )
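The new SQLMutate logic renames any existing column that a mutate() call redefines, using a random mutate<hex>_ prefix, so the fresh expression can reuse the original name without a duplicate-label collision in the combined SELECT. A standalone SQLAlchemy sketch of the same construction (table and column names are illustrative, not the library's internals):

    from secrets import token_hex
    import sqlalchemy as sa

    original = sa.select(sa.column("path"), sa.column("size")).subquery()
    new_size = (original.c.size * 2).label("size")   # redefinition coming from mutate()
    to_mutate = {"size"}

    prefix = f"mutate{token_hex(8)}_"
    cols = [
        c.label(prefix + c.name) if c.name in to_mutate else c
        for c in original.c
    ]
    stmt = sa.select(*cols, new_size).select_from(original).subquery()
    # stmt exposes: path, mutate<hex>_size (old value), size (new value)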
@@ -957,24 +949,24 @@ class SQLJoin(Step):


 @frozen
-class
-
-
-    cols: PartitionByType
+class SQLGroupBy(SQLClause):
+    cols: Sequence[Union[str, ColumnElement]]
+    group_by: Sequence[Union[str, ColumnElement]]

-    def
-
+    def apply_sql_clause(self, query) -> Select:
+        if not self.cols:
+            raise ValueError("No columns to select")
+        if not self.group_by:
+            raise ValueError("No columns to group by")

-
-        self, query_generator: QueryGenerator, temp_tables: list[str]
-    ) -> StepResult:
-        query = query_generator.select()
-        grouped_query = query.group_by(*self.cols)
+        subquery = query.subquery()

-
-
+        cols = [
+            subquery.c[str(c)] if isinstance(c, (str, C)) else c
+            for c in [*self.group_by, *self.cols]
+        ]

-        return
+        return sqlalchemy.select(*cols).select_from(subquery).group_by(*self.group_by)


 def _validate_columns(
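SQLGroupBy replaces the old standalone GroupBy step: instead of appending GROUP BY to the accumulated query, it wraps the previous query in a subquery and selects the grouping keys plus the requested columns from it. A rough SQLAlchemy equivalent of what apply_sql_clause builds (column names are illustrative):

    import sqlalchemy as sa

    previous = sa.select(sa.column("file__path"), sa.column("file__size")).subquery()
    group_keys = [previous.c["file__path"]]
    aggregates = [sa.func.sum(previous.c["file__size"]).label("total_size")]

    stmt = (
        sa.select(*group_keys, *aggregates)
        .select_from(previous)
        .group_by(*group_keys)
    )
    # roughly: SELECT file__path, sum(file__size) AS total_size FROM (...) GROUP BY file__path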
@@ -1130,25 +1122,14 @@ class DatasetQuery:
         query.steps = query.steps[-1:] + query.steps[:-1]

         result = query.starting_step.apply()
-        group_by = None
         self.dependencies.update(result.dependencies)

         for step in query.steps:
-            if isinstance(step, GroupBy):
-                if group_by is not None:
-                    raise TypeError("only one group_by allowed")
-                group_by = step
-                continue
-
             result = step.apply(
                 result.query_generator, self.temp_table_names
             )  # a chain of steps linked by results
             self.dependencies.update(result.dependencies)

-        if group_by:
-            result = group_by.apply(result.query_generator, self.temp_table_names)
-            self.dependencies.update(result.dependencies)
-
         return result.query_generator

     @staticmethod
@@ -1410,9 +1391,13 @@ class DatasetQuery:
         return query.as_scalar()

     @detach
-    def group_by(
+    def group_by(
+        self,
+        cols: Sequence[ColumnElement],
+        group_by: Sequence[ColumnElement],
+    ) -> "Self":
         query = self.clone()
-        query.steps.append(
+        query.steps.append(SQLGroupBy(cols, group_by))
         return query

     @detach
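At the DatasetQuery level the signature is now explicit: cols holds the (typically aggregate) expressions to select and group_by holds the grouping keys, which are selected as well. A hedged usage sketch; the dataset name, column names, and the keyword form of the call are illustrative, not confirmed API details:

    import sqlalchemy as sa
    from datachain.query import C, DatasetQuery

    dq = DatasetQuery(name="animals")          # assumes such a dataset already exists
    grouped = dq.group_by(
        cols=[sa.func.count().label("cnt")],   # aggregates to select
        group_by=[C("file__path")],            # grouping keys, also selected
    )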
@@ -1591,6 +1576,8 @@ class DatasetQuery:
         )
         version = version or dataset.latest_version

+        self.session.add_dataset_version(dataset=dataset, version=version)
+
         dr = self.catalog.warehouse.dataset_rows(dataset)

         self.catalog.warehouse.copy_table(dr.get_table(), query.select())
datachain/query/schema.py
CHANGED
@@ -1,16 +1,13 @@
 import functools
-import json
 from abc import ABC, abstractmethod
-from datetime import datetime, timezone
 from fnmatch import fnmatch
-from typing import TYPE_CHECKING, Any, Callable,
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union

 import attrs
 import sqlalchemy as sa
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback

 from datachain.lib.file import File
-from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -228,61 +225,4 @@ def normalize_param(param: UDFParamSpec) -> UDFParameter:
     raise TypeError(f"Invalid UDF parameter: {param}")


-class DatasetRow:
-    schema: ClassVar[dict[str, type[SQLType]]] = {
-        "source": String,
-        "path": String,
-        "size": Int64,
-        "location": JSON,
-        "is_latest": Boolean,
-        "last_modified": DateTime,
-        "version": String,
-        "etag": String,
-    }
-
-    @staticmethod
-    def create(
-        path: str,
-        source: str = "",
-        size: int = 0,
-        location: Optional[dict[str, Any]] = None,
-        is_latest: bool = True,
-        last_modified: Optional[datetime] = None,
-        version: str = "",
-        etag: str = "",
-    ) -> tuple[
-        str,
-        str,
-        int,
-        Optional[str],
-        int,
-        bool,
-        datetime,
-        str,
-        str,
-        int,
-    ]:
-        if location:
-            location = json.dumps([location])  # type: ignore [assignment]
-
-        last_modified = last_modified or datetime.now(timezone.utc)
-
-        return (  # type: ignore [return-value]
-            source,
-            path,
-            size,
-            location,
-            is_latest,
-            last_modified,
-            version,
-            etag,
-        )
-
-    @staticmethod
-    def extend(**columns):
-        cols = {**DatasetRow.schema}
-        cols.update(columns)
-        return cols
-
-
 C = Column
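The removed DatasetRow helper built raw tuples for the legacy listing schema; its role is covered by the File model from datachain.lib.file, whose import stays at the top of this module. A hedged sketch of the equivalent row, assuming File exposes the same fields the removed schema declared (source, path, size, location, is_latest, last_modified, version, etag):

    from datetime import datetime, timezone
    from datachain.lib.file import File

    row = File(
        source="s3://bucket",
        path="dir/object.csv",
        size=1024,
        etag="abc123",
        version="",
        is_latest=True,
        last_modified=datetime.now(timezone.utc),
    )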
datachain/query/session.py
CHANGED
@@ -1,9 +1,9 @@
 import atexit
+import gc
 import logging
-import os
 import re
 import sys
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, ClassVar, Optional
 from uuid import uuid4

 from datachain.catalog import get_catalog
@@ -11,6 +11,7 @@ from datachain.error import TableMissingError

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
+    from datachain.dataset import DatasetRecord

 logger = logging.getLogger("datachain")

@@ -39,7 +40,7 @@ class Session:
     """

     GLOBAL_SESSION_CTX: Optional["Session"] = None
-
+    SESSION_CONTEXTS: ClassVar[list["Session"]] = []
     ORIGINAL_EXCEPT_HOOK = None

     DATASET_PREFIX = "session_"
@@ -64,18 +65,21 @@ class Session:

         session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
         self.name = f"{name}_{session_uuid}"
-        self.job_id = os.getenv("DATACHAIN_JOB_ID") or str(uuid4())
         self.is_new_catalog = not catalog
         self.catalog = catalog or get_catalog(
             client_config=client_config, in_memory=in_memory
         )
+        self.dataset_versions: list[tuple[DatasetRecord, int]] = []

     def __enter__(self):
+        # Push the current context onto the stack
+        Session.SESSION_CONTEXTS.append(self)
+
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         if exc_type:
-            self._cleanup_created_versions(
+            self._cleanup_created_versions()

         self._cleanup_temp_datasets()
         if self.is_new_catalog:
@@ -83,6 +87,12 @@ class Session:
             self.catalog.warehouse.close_on_exit()
             self.catalog.id_generator.close_on_exit()

+        if Session.SESSION_CONTEXTS:
+            Session.SESSION_CONTEXTS.pop()
+
+    def add_dataset_version(self, dataset: "DatasetRecord", version: int) -> None:
+        self.dataset_versions.append((dataset, version))
+
     def generate_temp_dataset_name(self) -> str:
         return self.get_temp_prefix() + uuid4().hex[: self.TEMP_TABLE_UUID_LEN]

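Sessions now maintain an explicit stack of active contexts (SESSION_CONTEXTS), pushed in __enter__ and popped in __exit__, and each session records the dataset versions it created so they can be rolled back. The practical effect, sketched below with illustrative names, is that Session.get() can prefer the innermost with block over the global session:

    from datachain.query import Session

    with Session("outer") as outer:
        with Session("inner") as inner:
            assert Session.get() is inner   # most recent context wins
        assert Session.get() is outer       # popped back to the enclosing session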
@@ -98,21 +108,15 @@
         except TableMissingError:
             pass

-    def _cleanup_created_versions(self
-
-        if not versions:
+    def _cleanup_created_versions(self) -> None:
+        if not self.dataset_versions:
             return

-
-        for dataset_name, version in versions:
-            if dataset_name not in datasets:
-                datasets[dataset_name] = self.catalog.get_dataset(dataset_name)
-            dataset = datasets[dataset_name]
-            logger.info(
-                "Removing dataset version %s@%s due to exception", dataset_name, version
-            )
+        for dataset, version in self.dataset_versions:
             self.catalog.remove_dataset_version(dataset, version)

+        self.dataset_versions.clear()
+
     @classmethod
     def get(
         cls,
@@ -125,33 +129,34 @@

         Parameters:
             session (Session): Optional Session(). If not provided a new session will
-                be created. It's needed mostly for
-            catalog (Catalog): Optional catalog. By default a new catalog is created.
+                be created. It's needed mostly for simple API purposes.
+            catalog (Catalog): Optional catalog. By default, a new catalog is created.
         """
         if session:
             return session

-
+        # Access the active (most recent) context from the stack
+        if cls.SESSION_CONTEXTS:
+            return cls.SESSION_CONTEXTS[-1]
+
+        if cls.GLOBAL_SESSION_CTX is None:
             cls.GLOBAL_SESSION_CTX = Session(
                 cls.GLOBAL_SESSION_NAME,
                 catalog,
                 client_config=client_config,
                 in_memory=in_memory,
             )
-            cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()

             atexit.register(cls._global_cleanup)
             cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
             sys.excepthook = cls.except_hook

-        return cls.
+        return cls.GLOBAL_SESSION_CTX

     @staticmethod
     def except_hook(exc_type, exc_value, exc_traceback):
+        Session.GLOBAL_SESSION_CTX.__exit__(exc_type, exc_value, exc_traceback)
         Session._global_cleanup()
-        if Session.GLOBAL_SESSION_CTX is not None:
-            job_id = Session.GLOBAL_SESSION_CTX.job_id
-            Session.GLOBAL_SESSION_CTX._cleanup_created_versions(job_id)

         if Session.ORIGINAL_EXCEPT_HOOK:
             Session.ORIGINAL_EXCEPT_HOOK(exc_type, exc_value, exc_traceback)
@@ -160,7 +165,6 @@ class Session:
     def cleanup_for_tests(cls):
         if cls.GLOBAL_SESSION_CTX is not None:
             cls.GLOBAL_SESSION_CTX.__exit__(None, None, None)
-            cls.GLOBAL_SESSION = None
             cls.GLOBAL_SESSION_CTX = None
             atexit.unregister(cls._global_cleanup)

@@ -171,3 +175,7 @@ class Session:
     def _global_cleanup():
         if Session.GLOBAL_SESSION_CTX is not None:
             Session.GLOBAL_SESSION_CTX.__exit__(None, None, None)
+
+        for obj in gc.get_objects():  # Get all tracked objects
+            if isinstance(obj, Session):  # Cleanup temp dataset for session variables.
+                obj.__exit__(None, None, None)