datachain 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/asyn.py +4 -9
- datachain/cache.py +0 -1
- datachain/catalog/catalog.py +3 -12
- datachain/cli.py +4 -6
- datachain/client/azure.py +1 -13
- datachain/client/fsspec.py +7 -8
- datachain/client/gcs.py +2 -13
- datachain/client/hf.py +0 -10
- datachain/client/local.py +3 -12
- datachain/client/s3.py +9 -23
- datachain/data_storage/schema.py +4 -8
- datachain/data_storage/sqlite.py +10 -1
- datachain/data_storage/warehouse.py +17 -34
- datachain/lib/dc.py +0 -1
- datachain/lib/file.py +0 -3
- datachain/lib/listing.py +1 -2
- datachain/lib/model_store.py +2 -2
- datachain/lib/pytorch.py +32 -26
- datachain/lib/signal_schema.py +146 -58
- datachain/listing.py +8 -10
- datachain/node.py +3 -68
- datachain/query/builtins.py +0 -14
- datachain/query/schema.py +1 -16
- datachain/utils.py +0 -3
- {datachain-0.3.12.dist-info → datachain-0.3.14.dist-info}/METADATA +1 -1
- {datachain-0.3.12.dist-info → datachain-0.3.14.dist-info}/RECORD +30 -30
- {datachain-0.3.12.dist-info → datachain-0.3.14.dist-info}/LICENSE +0 -0
- {datachain-0.3.12.dist-info → datachain-0.3.14.dist-info}/WHEEL +0 -0
- {datachain-0.3.12.dist-info → datachain-0.3.14.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.12.dist-info → datachain-0.3.14.dist-info}/top_level.txt +0 -0
datachain/asyn.py
CHANGED
@@ -1,14 +1,8 @@
 import asyncio
-from collections.abc import Awaitable, Coroutine, Iterable
+from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from heapq import heappop, heappush
-from typing import (
-    Any,
-    Callable,
-    Generic,
-    Optional,
-    TypeVar,
-)
+from typing import Any, Callable, Generic, Optional, TypeVar

 from fsspec.asyn import get_loop

@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20

 InputT = TypeVar("InputT", contravariant=True)  # noqa: PLC0105
 ResultT = TypeVar("ResultT", covariant=True)  # noqa: PLC0105
+T = TypeVar("T")


 class AsyncMapper(Generic[InputT, ResultT]):

@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
         self._push_result(self._next_yield, None)


-def iter_over_async(ait, loop):
+def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
    """Wrap an asynchronous iterator into a synchronous one"""
    ait = ait.__aiter__()
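Note: the asyn.py change only adds type annotations; iter_over_async() still wraps an async iterator into a synchronous one. A minimal usage sketch, not part of the diff (the numbers() generator is illustrative; get_loop() is fsspec's shared IO loop):

    from fsspec.asyn import get_loop
    from datachain.asyn import iter_over_async

    async def numbers():
        # a small async generator to consume from synchronous code
        for i in range(3):
            yield i

    # the async iterator is driven on fsspec's loop, values are yielded synchronously
    for value in iter_over_async(numbers(), get_loop()):
        print(value)  # 0, 1, 2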
datachain/cache.py
CHANGED
datachain/catalog/catalog.py
CHANGED
@@ -62,7 +62,7 @@ from datachain.listing import Listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
-from datachain.sql.types import JSON, Boolean, DateTime,
+from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
 from datachain.storage import Storage, StorageStatus, StorageURI
 from datachain.utils import (
     DataChainDir,

@@ -513,8 +513,6 @@ def find_column_to_str( # noqa: PLR0911
     )
     if column == "name":
         return posixpath.basename(row[field_lookup["path"]]) or ""
-    if column == "owner":
-        return row[field_lookup["owner_name"]] or ""
     if column == "path":
         is_dir = row[field_lookup["dir_type"]] == DirType.DIR
         path = row[field_lookup["path"]]

@@ -666,16 +664,12 @@ class Catalog:
         source_metastore = self.metastore.clone(client.uri)

         columns = [
-            Column("vtype", String),
-            Column("dir_type", Int),
             Column("path", String),
             Column("etag", String),
             Column("version", String),
             Column("is_latest", Boolean),
             Column("last_modified", DateTime(timezone=True)),
             Column("size", Int64),
-            Column("owner_name", String),
-            Column("owner_id", String),
             Column("location", JSON),
             Column("source", String),
         ]

@@ -1396,12 +1390,12 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

-    def dataset_stats(self, name: str, version: int) -> DatasetStats:
+    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
         dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version)
+        dataset_version = dataset.get_version(version or dataset.latest_version)
         return DatasetStats(
             num_objects=dataset_version.num_objects,
             size=dataset_version.size,

@@ -1516,7 +1510,6 @@ class Catalog:
             row["etag"],
             row["version"],
             row["is_latest"],
-            row["vtype"],
             row["location"],
             row["last_modified"],
         )

@@ -1987,8 +1980,6 @@ class Catalog:
             field_set.add("path")
         elif column == "name":
             field_set.add("path")
-        elif column == "owner":
-            field_set.add("owner_name")
         elif column == "path":
             field_set.add("dir_type")
             field_set.add("path")
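Note: dataset_stats() now takes an optional version and falls back to the dataset's latest version when it is None. A hypothetical call sketch (catalog is assumed to be an existing Catalog instance and "my-dataset" an existing dataset; neither comes from the diff):

    stats = catalog.dataset_stats("my-dataset", version=None)  # resolves to latest_version
    stats_v2 = catalog.dataset_stats("my-dataset", version=2)  # explicit version
    print(stats.num_objects, stats.size)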
datachain/cli.py
CHANGED
@@ -24,7 +24,7 @@ logger = logging.getLogger("datachain")

 TTL_HUMAN = "4h"
 TTL_INT = 4 * 60 * 60
-FIND_COLUMNS = ["du", "name", "
+FIND_COLUMNS = ["du", "name", "path", "size", "type"]


 def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:

@@ -579,9 +579,8 @@ def _node_data_to_ls_values(row, long_format=False):
     value = name + ending
     if long_format:
         last_modified = row[2]
-        owner_name = row[3]
         timestamp = last_modified if not is_dir else None
-        return long_line_str(value, timestamp
+        return long_line_str(value, timestamp)
     return value


@@ -599,7 +598,7 @@ def _ls_urls_flat(
     if client_cls.is_root_url(source):
         buckets = client_cls.ls_buckets(**catalog.client_config)
         if long:
-            values = (long_line_str(b.name, b.created
+            values = (long_line_str(b.name, b.created) for b in buckets)
         else:
             values = (b.name for b in buckets)
         yield source, values

@@ -607,7 +606,7 @@ def _ls_urls_flat(
         found = False
         fields = ["name", "dir_type"]
         if long:
-            fields.
+            fields.append("last_modified")
         for data_source, results in catalog.ls([source], fields=fields, **kwargs):
             values = (_node_data_to_ls_values(r, long) for r in results)
             found = True

@@ -683,7 +682,6 @@ def ls_remote(
             entry = long_line_str(
                 row["name"] + ("/" if row["dir_type"] else ""),
                 row["last_modified"],
-                row["owner_name"],
             )
             print(format_ls_entry(entry))
         else:
datachain/client/azure.py
CHANGED
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -14,17 +13,6 @@ class AzureClient(Client):
     PREFIX = "az://"
     protocol = "az"

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        version_id = v.get("version_id")
-        return Entry.from_file(
-            path=path,
-            etag=v.get("etag", "").strip('"'),
-            version=version_id or "",
-            is_latest=version_id is None or bool(v.get("is_current_version")),
-            last_modified=v["last_modified"],
-            size=v.get("size", ""),
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         version_id = v.get("version_id")
         return File(

@@ -57,7 +45,7 @@ class AzureClient(Client):
                 continue
             info = (await self.fs._details([b]))[0]
             entries.append(
-                self.
+                self.info_to_file(info, self.rel_path(info["name"]))
             )
         if entries:
             await result_queue.put(entries)
datachain/client/fsspec.py
CHANGED
@@ -29,7 +29,7 @@ from tqdm import tqdm
 from datachain.cache import DataChainCache, UniqueId
 from datachain.client.fileslice import FileSlice, FileWrapper
 from datachain.error import ClientError as DataChainClientError
-from datachain.
+from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
 from datachain.nodes_thread_pool import NodeChunk
 from datachain.storage import StorageURI

@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.

 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")

-ResultQueue = asyncio.Queue[Optional[Sequence[
+ResultQueue = asyncio.Queue[Optional[Sequence[File]]]


 def _is_win_local_path(uri: str) -> bool:

@@ -188,7 +188,7 @@ class Client(ABC):

     async def get_current_etag(self, uid: UniqueId) -> str:
         info = await self.fs._info(self.get_full_path(uid.path))
-        return self.
+        return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
         return await self.fs._size(path)

@@ -198,7 +198,7 @@ class Client(ABC):

     async def scandir(
         self, start_prefix: str, method: str = "default"
-    ) -> AsyncIterator[Sequence[
+    ) -> AsyncIterator[Sequence[File]]:
         try:
             impl = getattr(self, f"_fetch_{method}")
         except AttributeError:

@@ -264,7 +264,7 @@ class Client(ABC):
     ) -> None:
         await self._fetch_nested(start_prefix, result_queue)

-    async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
+    async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
         path = f"{self.name}/{prefix}"
         infos = await self.ls_dir(path)
         files = []

@@ -277,7 +277,7 @@ class Client(ABC):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.
+                files.append(self.info_to_file(info, subprefix))
         if files:
             await result_queue.put(files)
         found_count = len(subdirs) + len(files)

@@ -303,7 +303,7 @@ class Client(ABC):
         return f"{self.PREFIX}{self.name}/{rel_path}"

     @abstractmethod
-    def
+    def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...

     def fetch_nodes(
         self,

@@ -363,7 +363,6 @@ class Client(ABC):
             parent["path"],
             parent["size"],
             parent["etag"],
-            vtype=parent["vtype"],
             location=parent["location"],
         )
         f = self.open_object(parent_uid, use_cache=use_cache)
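Note: listing results are now typed as datachain.lib.file.File end to end (ResultQueue, scandir(), and the abstract info_to_file() hook that replaces the removed Entry-based convert_info()). A standalone sketch of the kind of conversion the hook performs, assuming a generic fsspec-style info dict; the helper name and the fields used are illustrative only:

    from typing import Any

    from datachain.lib.file import File

    def info_to_file_sketch(info: dict[str, Any], path: str, source: str) -> File:
        # mirrors the shape of the per-client info_to_file() implementations in this diff
        return File(
            source=source,
            path=path,
            size=info.get("size", 0),
            etag=str(info.get("etag", "")),
        )

    print(info_to_file_sketch({"size": 1024, "etag": "abc"}, "images/cat.jpg", "s3://bucket"))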
datachain/client/gcs.py
CHANGED
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -108,19 +107,9 @@ class GCSClient(Client):
         finally:
             await page_queue.put(None)

-    def _entry_from_dict(self, d: dict[str, Any]) ->
+    def _entry_from_dict(self, d: dict[str, Any]) -> File:
         info = self.fs._process_object(self.name, d)
-        return self.
-
-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v.get("etag", ""),
-            version=v.get("generation", ""),
-            is_latest=not v.get("timeDeleted"),
-            last_modified=self.parse_timestamp(v["updated"]),
-            size=v.get("size", ""),
-        )
+        return self.info_to_file(info, self.rel_path(info["name"]))

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
datachain/client/hf.py
CHANGED
@@ -5,7 +5,6 @@ from typing import Any, cast
 from huggingface_hub import HfFileSystem

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import Client

@@ -22,15 +21,6 @@ class HfClient(Client):

         return cast(HfFileSystem, super().create_fs(**kwargs))

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            size=v["size"],
-            version=v["last_commit"].oid,
-            etag=v.get("blob_id", ""),
-            last_modified=v["last_commit"].date,
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
             path=path,
datachain/client/local.py
CHANGED
@@ -7,8 +7,8 @@ from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

+from datachain.cache import UniqueId
 from datachain.lib.file import File
-from datachain.node import Entry
 from datachain.storage import StorageURI

 from .fsspec import Client

@@ -114,9 +114,9 @@ class FileClient(Client):
             use_symlinks=use_symlinks,
         )

-    async def get_current_etag(self, uid) -> str:
+    async def get_current_etag(self, uid: UniqueId) -> str:
         info = self.fs.info(self.get_full_path(uid.path))
-        return self.
+        return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
         return self.fs.size(path)

@@ -136,15 +136,6 @@ class FileClient(Client):
             full_path += "/"
         return full_path

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v["mtime"].hex(),
-            is_latest=True,
-            last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
-            size=v.get("size", ""),
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
             source=self.uri,
datachain/client/s3.py
CHANGED
@@ -1,12 +1,11 @@
 import asyncio
-from typing import Any, cast
+from typing import Any, Optional, cast

 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -111,24 +110,23 @@ class ClientS3(Client):
     ) -> None:
         await self._fetch_flat(start_prefix, result_queue)

-    def _entry_from_boto(self, v, bucket, versions=False):
-        return
+    def _entry_from_boto(self, v, bucket, versions=False) -> File:
+        return File(
+            source=self.uri,
             path=v["Key"],
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
             is_latest=v.get("IsLatest", True),
             last_modified=v.get("LastModified", ""),
             size=v["Size"],
-            owner_name=v.get("Owner", {}).get("DisplayName", ""),
-            owner_id=v.get("Owner", {}).get("ID", ""),
         )

     async def _fetch_dir(
         self,
         prefix,
         pbar,
-        result_queue,
-    ):
+        result_queue: ResultQueue,
+    ) -> set[str]:
         if prefix:
             prefix = prefix.lstrip(DELIMITER) + DELIMITER
         files = []

@@ -143,7 +141,7 @@ class ClientS3(Client):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.
+                files.append(self.info_to_file(info, subprefix))
             pbar.update()
             found = True
         if not found:

@@ -154,20 +152,8 @@ class ClientS3(Client):
         return subdirs

     @staticmethod
-    def clean_s3_version(ver):
-        return ver if ver != "null" else ""
-
-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v.get("ETag", "").strip('"'),
-            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
-            is_latest=v.get("IsLatest", True),
-            last_modified=v.get("LastModified", ""),
-            size=v["size"],
-            owner_name=v.get("Owner", {}).get("DisplayName", ""),
-            owner_id=v.get("Owner", {}).get("ID", ""),
-        )
+    def clean_s3_version(ver: Optional[str]) -> str:
+        return ver if (ver is not None and ver != "null") else ""

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
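Note: clean_s3_version() is now annotated and also accepts a missing (None) version id. A behavior sketch that follows directly from the new code:

    from datachain.client.s3 import ClientS3

    assert ClientS3.clean_s3_version("abc123") == "abc123"
    assert ClientS3.clean_s3_version("null") == ""  # S3 reports "null" for unversioned buckets
    assert ClientS3.clean_s3_version(None) == ""    # newly tolerated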
datachain/data_storage/schema.py
CHANGED
@@ -10,9 +10,8 @@ from typing import (

 import sqlalchemy as sa
 from sqlalchemy.sql import func as f
-from sqlalchemy.sql.expression import null, true
+from sqlalchemy.sql.expression import false, null, true

-from datachain.node import DirType
 from datachain.sql.functions import path
 from datachain.sql.types import Int, SQLType, UInt64

@@ -81,8 +80,7 @@ class DirExpansion:
     def base_select(q):
         return sa.select(
             q.c.sys__id,
-
-            (q.c.dir_type == DirType.DIR).label("is_dir"),
+            false().label("is_dir"),
             q.c.source,
             q.c.path,
             q.c.version,

@@ -94,7 +92,6 @@ class DirExpansion:
         return (
             sa.select(
                 f.min(q.c.sys__id).label("sys__id"),
-                q.c.vtype,
                 q.c.is_dir,
                 q.c.source,
                 q.c.path,

@@ -102,8 +99,8 @@ class DirExpansion:
                 f.max(q.c.location).label("location"),
             )
             .select_from(q)
-            .group_by(q.c.source, q.c.path, q.c.
-            .order_by(q.c.source, q.c.path, q.c.
+            .group_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
+            .order_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
         )

     @classmethod

@@ -113,7 +110,6 @@ class DirExpansion:
         q = q.union_all(
             sa.select(
                 sa.literal(-1).label("sys__id"),
-                sa.literal("").label("vtype"),
                 true().label("is_dir"),
                 q.c.source,
                 parent.label("path"),
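Note: DirExpansion.base_select() no longer derives is_dir from the removed dir_type column; plain rows are now labeled with a literal false(). A standalone SQLAlchemy sketch of that labeling (the table and column names here are illustrative, not from datachain):

    import sqlalchemy as sa
    from sqlalchemy.sql.expression import false

    rows = sa.table("rows", sa.column("sys__id"), sa.column("path"))
    stmt = sa.select(rows.c.sys__id, false().label("is_dir"), rows.c.path)
    print(stmt)  # SELECT rows.sys__id, false AS is_dir, rows.path FROM rows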
datachain/data_storage/sqlite.py
CHANGED
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.types import TypeEngine

+    from datachain.lib.file import File
+

 logger = logging.getLogger("datachain")

@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote


+def _get_in_memory_uri():
+    return "file::memory:?cache=shared"
+
+
 def get_retry_sleep_sec(retry_count: int) -> int:
     return RETRY_START_SEC * (RETRY_FACTOR**retry_count)


@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         if db_file == ":memory:":
             # Enable multithreaded usage of the same in-memory db
             db = sqlite3.connect(
-
+                _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
             )
         else:
             db = sqlite3.connect(

@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):

         self.db.execute(insert_query)

+    def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+        return (e.model_dump() for e in entries)
+
     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
         rows = list(rows)
         if not rows:
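Note: the in-memory database path is now built by the _get_in_memory_uri() helper. A standalone sketch of why the shared-cache URI matters for multithreaded use: connections opened with the same URI see one in-memory database (the table here is illustrative):

    import sqlite3

    uri = "file::memory:?cache=shared"
    db1 = sqlite3.connect(uri, uri=True)
    db2 = sqlite3.connect(uri, uri=True)  # same process, same shared in-memory db
    db1.execute("CREATE TABLE t (x INTEGER)")
    db1.execute("INSERT INTO t VALUES (1)")
    db1.commit()
    print(db2.execute("SELECT x FROM t").fetchall())  # [(1,)]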
datachain/data_storage/warehouse.py
CHANGED

@@ -20,7 +20,7 @@ from datachain.client import Client
 from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import DatasetRecord
-from datachain.node import DirType, DirTypeGroup,
+from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.storage import StorageURI

@@ -28,13 +28,13 @@ from datachain.utils import sql_escape_like

 if TYPE_CHECKING:
     from sqlalchemy.sql._typing import _ColumnsClauseArgument
-    from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine

     from datachain.data_storage import AbstractIDGenerator, schema
     from datachain.data_storage.db_engine import DatabaseEngine
     from datachain.data_storage.schema import DataTable
+    from datachain.lib.file import File

 try:
     import numpy as np

@@ -341,9 +341,7 @@ class AbstractWarehouse(ABC, Serializable):

         column_objects = [dr.c[c] for c in column_names]
         # include all object types - file, tar archive, tar file (subobject)
-        select_query = dr.select(*column_objects).where(
-            dr.c.dir_type.in_(DirTypeGroup.FILE) & (dr.c.is_latest == true())
-        )
+        select_query = dr.select(*column_objects).where(dr.c.is_latest == true())
         if path is None:
             return select_query
         if recursive:

@@ -404,26 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
         expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
             sa.func.count(table.c.sys__id),
         )
-
-
-
-
+        size_columns = [
+            c for c in table.columns if c.name == "size" or c.name.endswith("__size")
+        ]
+        if size_columns:
+            expressions = (*expressions, sa.func.sum(sum(size_columns)))
         query = select(*expressions)
         ((nrows, *rest),) = self.db.execute(query)
-        return nrows, rest[0] if rest else
-
-    def prepare_entries(
-        self, uri: str, entries: Iterable[Entry]
-    ) -> list[dict[str, Any]]:
-        """
-        Prepares bucket listing entry (row) for inserting into database
-        """
-
-        def _prepare_entry(entry: Entry):
-            assert entry.dir_type is not None
-            return attrs.asdict(entry) | {"source": uri}
+        return nrows, rest[0] if rest else 0

-
+    @abstractmethod
+    def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+        """Convert File entries so they can be passed on to `insert_rows()`"""

     @abstractmethod
     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:

@@ -440,7 +430,7 @@ class AbstractWarehouse(ABC, Serializable):
         """Inserts dataset rows directly into dataset table"""

     @abstractmethod
-    def instr(self, source, target) ->
+    def instr(self, source, target) -> sa.ColumnElement:
         """
         Return SQLAlchemy Boolean determining if a target substring is present in
         source string column

@@ -500,7 +490,7 @@ class AbstractWarehouse(ABC, Serializable):
         c = query.selected_columns
         q = query.where(c.dir_type.in_(file_group))
         if not include_subobjects:
-            q = q.where(c.
+            q = q.where((c.location == "") | (c.location.is_(None)))
         return q

     def get_nodes(self, query) -> Iterator[Node]:

@@ -624,8 +614,7 @@ class AbstractWarehouse(ABC, Serializable):

         return sa.select(
             de.c.sys__id,
-
-            case((de.c.is_dir == true(), DirType.DIR), else_=dr.c.dir_type).label(
+            case((de.c.is_dir == true(), DirType.DIR), else_=DirType.FILE).label(
                 "dir_type"
             ),
             de.c.path,

@@ -634,8 +623,6 @@ class AbstractWarehouse(ABC, Serializable):
             with_default(dr.c.is_latest),
             dr.c.last_modified,
             with_default(dr.c.size),
-            with_default(dr.c.owner_name),
-            with_default(dr.c.owner_id),
             with_default(dr.c.sys__rand),
             dr.c.location,
             de.c.source,

@@ -650,7 +637,6 @@ class AbstractWarehouse(ABC, Serializable):
         query = dr.select().where(
             self.path_expr(dr) == path,
             dr.c.is_latest == true(),
-            dr.c.dir_type != DirType.DIR,
         )
         row = next(self.db.execute(query), None)
         if row is not None:

@@ -660,7 +646,6 @@ class AbstractWarehouse(ABC, Serializable):
             dr.select()
             .where(
                 dr.c.is_latest == true(),
-                dr.c.dir_type != DirType.DIR,
                 dr.c.path.startswith(path),
             )
             .exists()

@@ -761,13 +746,11 @@ class AbstractWarehouse(ABC, Serializable):

         sub_glob = posixpath.join(path, "*")
         dr = dataset_rows
-        selections = [
+        selections: list[sa.ColumnElement] = [
             func.sum(dr.c.size),
         ]
         if count_files:
-            selections.append(
-                func.sum(dr.c.dir_type.in_(DirTypeGroup.FILE)),
-            )
+            selections.append(func.count())
         results = next(
             self.db.execute(
                 dr.select(*selections).where(