datachain 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (39)
  1. datachain/catalog/catalog.py +33 -5
  2. datachain/catalog/loader.py +19 -13
  3. datachain/cli/__init__.py +3 -1
  4. datachain/cli/commands/show.py +12 -1
  5. datachain/cli/parser/studio.py +13 -1
  6. datachain/cli/parser/utils.py +6 -0
  7. datachain/client/fsspec.py +12 -16
  8. datachain/client/hf.py +36 -14
  9. datachain/client/local.py +1 -4
  10. datachain/data_storage/warehouse.py +3 -8
  11. datachain/dataset.py +8 -0
  12. datachain/error.py +0 -12
  13. datachain/fs/utils.py +30 -0
  14. datachain/func/__init__.py +5 -0
  15. datachain/func/func.py +2 -1
  16. datachain/lib/data_model.py +6 -0
  17. datachain/lib/dc.py +114 -28
  18. datachain/lib/file.py +100 -25
  19. datachain/lib/image.py +30 -6
  20. datachain/lib/listing.py +21 -39
  21. datachain/lib/signal_schema.py +194 -15
  22. datachain/lib/video.py +7 -5
  23. datachain/model/bbox.py +209 -58
  24. datachain/model/pose.py +49 -37
  25. datachain/model/segment.py +22 -18
  26. datachain/model/ultralytics/bbox.py +9 -9
  27. datachain/model/ultralytics/pose.py +7 -7
  28. datachain/model/ultralytics/segment.py +7 -7
  29. datachain/model/utils.py +191 -0
  30. datachain/nodes_thread_pool.py +32 -11
  31. datachain/query/dataset.py +4 -2
  32. datachain/studio.py +8 -6
  33. datachain/utils.py +3 -16
  34. {datachain-0.11.0.dist-info → datachain-0.12.0.dist-info}/METADATA +6 -4
  35. {datachain-0.11.0.dist-info → datachain-0.12.0.dist-info}/RECORD +39 -37
  36. {datachain-0.11.0.dist-info → datachain-0.12.0.dist-info}/WHEEL +1 -1
  37. {datachain-0.11.0.dist-info → datachain-0.12.0.dist-info}/LICENSE +0 -0
  38. {datachain-0.11.0.dist-info → datachain-0.12.0.dist-info}/entry_points.txt +0 -0
  39. {datachain-0.11.0.dist-info → datachain-0.12.0.dist-info}/top_level.txt +0 -0
datachain/lib/listing.py CHANGED
@@ -1,19 +1,21 @@
+import glob
 import logging
 import os
 import posixpath
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union

 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true

+import datachain.fs.utils as fsutils
 from datachain.asyn import iter_over_async
 from datachain.client import Client
-from datachain.error import REMOTE_ERRORS, ClientError
+from datachain.error import ClientError
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
-from datachain.telemetry import telemetry
 from datachain.utils import uses_glob

 if TYPE_CHECKING:
@@ -92,38 +94,6 @@ def ls(
     return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))


-def _isfile(client: "Client", path: str) -> bool:
-    """
-    Returns True if uri points to a file
-    """
-    try:
-        if "://" in path:
-            # This makes sure that the uppercase scheme is converted to lowercase
-            scheme, path = path.split("://", 1)
-            path = f"{scheme.lower()}://{path}"
-
-        if os.name == "nt" and "*" in path:
-            # On Windows, the glob pattern "*" is not supported
-            return False
-
-        info = client.fs.info(path)
-        name = info.get("name")
-        # case for special simulated directories on some clouds
-        # e.g. Google creates a zero byte file with the same name as the
-        # directory with a trailing slash at the end
-        if not name or name.endswith("/"):
-            return False
-
-        return info["type"] == "file"
-    except FileNotFoundError:
-        return False
-    except REMOTE_ERRORS as e:
-        raise ClientError(
-            message=str(e),
-            error_code=getattr(e, "code", None),
-        ) from e
-
-
 def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
@@ -156,8 +126,16 @@ def listing_uri_from_name(dataset_name: str) -> str:
     return dataset_name.removeprefix(LISTING_PREFIX)


+@contextmanager
+def _reraise_as_client_error() -> Iterator[None]:
+    try:
+        yield
+    except Exception as e:
+        raise ClientError(message=str(e), error_code=getattr(e, "code", None)) from e
+
+
 def get_listing(
-    uri: str, session: "Session", update: bool = False
+    uri: Union[str, os.PathLike[str]], session: "Session", update: bool = False
 ) -> tuple[Optional[str], str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
@@ -167,6 +145,7 @@ def get_listing(
     be used to find rows based on uri.
     """
     from datachain.client.local import FileClient
+    from datachain.telemetry import telemetry

     catalog = session.catalog
     cache = catalog.cache
@@ -174,11 +153,14 @@ def get_listing(

     client = Client.get_client(uri, cache, **client_config)
     telemetry.log_param("client", client.PREFIX)
+    if not isinstance(uri, str):
+        uri = os.fspath(uri)

     # we don't want to use cached dataset (e.g. for a single file listing)
-    if not uri.endswith("/") and _isfile(client, uri):
-        storage_uri, path = Client.parse_url(uri)
-        return None, f"{storage_uri}/{path.lstrip('/')}", path, False
+    isfile = _reraise_as_client_error()(fsutils.isfile)
+    if not glob.has_magic(uri) and not uri.endswith("/") and isfile(client.fs, uri):
+        _, path = Client.parse_url(uri)
+        return None, uri, path, False

     ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
     listing = None
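
A note on the `isfile = _reraise_as_client_error()(fsutils.isfile)` line above: objects produced by a `@contextmanager`-decorated function inherit from `contextlib.ContextDecorator`, so an instance can also wrap a callable and run each call inside a fresh context. A minimal sketch of that pattern, with a hypothetical `RuntimeError` wrapper standing in for `ClientError`:

```python
from collections.abc import Iterator
from contextlib import contextmanager


@contextmanager
def _reraise_as_runtime_error() -> Iterator[None]:
    # Stand-in for _reraise_as_client_error: any exception raised in
    # the managed block is re-raised under a single error type.
    try:
        yield
    except Exception as e:
        raise RuntimeError(str(e)) from e


def flaky_isfile(path: str) -> bool:
    # Hypothetical filesystem probe that fails with a backend error.
    raise OSError(f"backend unavailable: {path}")


# @contextmanager results are also ContextDecorators, so an instance
# can wrap an existing function; the context is re-entered per call.
isfile = _reraise_as_runtime_error()(flaky_isfile)

try:
    isfile("s3://bucket/key")
except RuntimeError as e:
    print("re-raised:", e)  # re-raised: backend unavailable: s3://bucket/key
```
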
datachain/lib/signal_schema.py CHANGED
@@ -91,6 +91,7 @@ class CustomType(BaseModel):
     name: str
     fields: dict[str, str]
     bases: list[tuple[str, str, Optional[str]]]
+    hidden_fields: Optional[list[str]] = None

     @classmethod
     def deserialize(cls, data: dict[str, Any], type_name: str) -> "CustomType":
@@ -102,6 +103,7 @@ class CustomType(BaseModel):
                 "name": type_name,
                 "fields": data,
                 "bases": [],
+                "hidden_fields": [],
             }

         return cls(**data)
@@ -179,6 +181,16 @@ class SignalSchema:
         )
         return SignalSchema(signals)

+    @staticmethod
+    def _get_bases(fr: type) -> list[tuple[str, str, Optional[str]]]:
+        bases: list[tuple[str, str, Optional[str]]] = []
+        for base in fr.__mro__:
+            model_store_name = (
+                ModelStore.get_name(base) if issubclass(base, DataModel) else None
+            )
+            bases.append((base.__name__, base.__module__, model_store_name))
+        return bases
+
     @staticmethod
     def _serialize_custom_model(
         version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
@@ -196,14 +208,15 @@ class SignalSchema:
             assert field_type
             fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)

-        bases: list[tuple[str, str, Optional[str]]] = []
-        for type_ in fr.__mro__:
-            model_store_name = (
-                ModelStore.get_name(type_) if issubclass(type_, DataModel) else None
-            )
-            bases.append((type_.__name__, type_.__module__, model_store_name))
+        bases = SignalSchema._get_bases(fr)

-        ct = CustomType(schema_version=2, name=version_name, fields=fields, bases=bases)
+        ct = CustomType(
+            schema_version=2,
+            name=version_name,
+            fields=fields,
+            bases=bases,
+            hidden_fields=getattr(fr, "_hidden_fields", []),
+        )
         custom_types[version_name] = ct.model_dump()

         return version_name
@@ -384,6 +397,37 @@ class SignalSchema:

         return SignalSchema(signals)

+    @staticmethod
+    def get_flatten_hidden_fields(schema):
+        custom_types = schema.get("_custom_types", {})
+        if not custom_types:
+            return []
+
+        hidden_by_types = {
+            name: schema.get("hidden_fields", [])
+            for name, schema in custom_types.items()
+        }
+
+        hidden_fields = []
+
+        def traverse(prefix, schema_info):
+            for field, field_type in schema_info.items():
+                if field == "_custom_types":
+                    continue
+
+                if field_type in custom_types:
+                    hidden_fields.extend(
+                        f"{prefix}{field}__{f}" for f in hidden_by_types[field_type]
+                    )
+                    traverse(
+                        prefix + field + "__",
+                        custom_types[field_type].get("fields", {}),
+                    )
+
+        traverse("", schema)
+
+        return hidden_fields
+
     def to_udf_spec(self) -> dict[str, type]:
         res = {}
         for path, type_, has_subtree, _ in self.get_flat_tree():
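
To make the new helper concrete: `get_flatten_hidden_fields` walks a serialized schema and returns flat column names joined with `__`, matching the flat DB-column naming used elsewhere in the schema code. A small sketch with a hand-built serialized schema (the type and field names are made up for illustration):

```python
from datachain.lib.signal_schema import SignalSchema

# Hypothetical serialized schema: "file" is a custom type whose "aux"
# field is itself a custom type; each declares one hidden field.
schema = {
    "file": "MyFile@v1",
    "_custom_types": {
        "MyFile@v1": {
            "fields": {"path": "str", "etag": "str", "aux": "Aux@v1"},
            "hidden_fields": ["etag"],
        },
        "Aux@v1": {
            "fields": {"token": "str"},
            "hidden_fields": ["token"],
        },
    },
}

# Per the traversal above, nested hidden fields are prefixed with the
# full "__"-joined path down to their parent signal:
print(SignalSchema.get_flatten_hidden_fields(schema))
# ['file__etag', 'file__aux__token']
```
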
@@ -479,7 +523,7 @@ class SignalSchema:
         raise SignalResolvingError([col_name], "is not found")

     def db_signals(
-        self, name: Optional[str] = None, as_columns=False
+        self, name: Optional[str] = None, as_columns=False, include_hidden: bool = True
     ) -> Union[list[str], list[Column]]:
         """
         Returns DB columns as strings or Column objects with proper types
@@ -489,7 +533,9 @@ class SignalSchema:
             DEFAULT_DELIMITER.join(path)
             if not as_columns
             else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
-            for path, _type, has_subtree, _ in self.get_flat_tree()
+            for path, _type, has_subtree, _ in self.get_flat_tree(
+                include_hidden=include_hidden
+            )
             if not has_subtree
         ]
@@ -624,19 +670,31 @@ class SignalSchema:
             for name, val in values.items()
         }

-    def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
-        yield from self._get_flat_tree(self.tree, [], 0)
+    def get_flat_tree(
+        self, include_hidden: bool = True
+    ) -> Iterator[tuple[list[str], DataType, bool, int]]:
+        yield from self._get_flat_tree(self.tree, [], 0, include_hidden)

     def _get_flat_tree(
-        self, tree: dict, prefix: list[str], depth: int
+        self, tree: dict, prefix: list[str], depth: int, include_hidden: bool
     ) -> Iterator[tuple[list[str], DataType, bool, int]]:
         for name, (type_, substree) in tree.items():
             suffix = name.split(".")
             new_prefix = prefix + suffix
+            hidden_fields = getattr(type_, "_hidden_fields", None)
+            if hidden_fields and substree and not include_hidden:
+                substree = {
+                    field: info
+                    for field, info in substree.items()
+                    if field not in hidden_fields
+                }
+
             has_subtree = substree is not None
             yield new_prefix, type_, has_subtree, depth
             if substree is not None:
-                yield from self._get_flat_tree(substree, new_prefix, depth + 1)
+                yield from self._get_flat_tree(
+                    substree, new_prefix, depth + 1, include_hidden
+                )

     def print_tree(self, indent: int = 4, start_at: int = 0):
         for path, type_, _, depth in self.get_flat_tree():
@@ -649,9 +707,13 @@ class SignalSchema:
         sub_schema = SignalSchema({"* list of": args[0]})
         sub_schema.print_tree(indent=indent, start_at=total_indent + indent)

-    def get_headers_with_length(self):
+    def get_headers_with_length(self, include_hidden: bool = True):
         paths = [
-            path for path, _, has_subtree, _ in self.get_flat_tree() if not has_subtree
+            path
+            for path, _, has_subtree, _ in self.get_flat_tree(
+                include_hidden=include_hidden
+            )
+            if not has_subtree
         ]
         max_length = max([len(path) for path in paths], default=0)
         return [
@@ -749,3 +811,120 @@ class SignalSchema:
             res[name] = (anno, subtree)  # type: ignore[assignment]

         return res
+
+    def to_partial(self, *columns: str) -> "SignalSchema":
+        """
+        Convert the schema to a partial schema with only the specified columns.
+
+        E.g. if original schema is:
+
+        ```
+        signal: Foo@v1
+            name: str
+            value: float
+        count: int
+        ```
+
+        Then `to_partial("signal.name", "count")` will return a partial schema:
+
+        ```
+        signal: FooPartial@v1
+            name: str
+        count: int
+        ```
+
+        Note that partial schema will have a different name for the custom types
+        (e.g. `FooPartial@v1` instead of `Foo@v1`) to avoid conflicts
+        with the original schema.
+
+        Args:
+            *columns (str): The columns to include in the partial schema.
+
+        Returns:
+            SignalSchema: The new partial schema.
+        """
+        serialized = self.serialize()
+        custom_types = serialized.get("_custom_types", {})
+
+        schema: dict[str, Any] = {}
+        schema_custom_types: dict[str, CustomType] = {}
+
+        data_model_bases: Optional[list[tuple[str, str, Optional[str]]]] = None
+
+        signal_partials: dict[str, str] = {}
+        partial_versions: dict[str, int] = {}
+
+        def _type_name_to_partial(signal_name: str, type_name: str) -> str:
+            if "@" not in type_name:
+                return type_name
+            model_name, _ = ModelStore.parse_name_version(type_name)
+
+            if signal_name not in signal_partials:
+                partial_versions.setdefault(model_name, 0)
+                partial_versions[model_name] += 1
+                version = partial_versions[model_name]
+                signal_partials[signal_name] = f"{model_name}Partial{version}"
+
+            return signal_partials[signal_name]
+
+        for column in columns:
+            parent_type, parent_type_partial = "", ""
+            column_parts = column.split(".")
+            for i, signal in enumerate(column_parts):
+                if i == 0:
+                    if signal not in serialized:
+                        raise SignalSchemaError(
+                            f"Column {column} not found in the schema"
+                        )
+
+                    parent_type = serialized[signal]
+                    parent_type_partial = _type_name_to_partial(signal, parent_type)
+
+                    schema[signal] = parent_type_partial
+                    continue
+
+                if parent_type not in custom_types:
+                    raise SignalSchemaError(
+                        f"Custom type {parent_type} not found in the schema"
+                    )
+
+                custom_type = custom_types[parent_type]
+                signal_type = custom_type["fields"].get(signal)
+                if not signal_type:
+                    raise SignalSchemaError(
+                        f"Field {signal} not found in custom type {parent_type}"
+                    )
+
+                partial_type = _type_name_to_partial(
+                    ".".join(column_parts[: i + 1]),
+                    signal_type,
+                )
+
+                if parent_type_partial in schema_custom_types:
+                    schema_custom_types[parent_type_partial].fields[signal] = (
+                        partial_type
+                    )
+                else:
+                    if data_model_bases is None:
+                        data_model_bases = SignalSchema._get_bases(DataModel)
+
+                    partial_type_name, _ = ModelStore.parse_name_version(partial_type)
+                    schema_custom_types[parent_type_partial] = CustomType(
+                        schema_version=2,
+                        name=partial_type_name,
+                        fields={signal: partial_type},
+                        bases=[
+                            (partial_type_name, "__main__", partial_type),
+                            *data_model_bases,
+                        ],
+                    )
+
+                parent_type, parent_type_partial = signal_type, partial_type
+
+        if schema_custom_types:
+            schema["_custom_types"] = {
+                type_name: ct.model_dump()
+                for type_name, ct in schema_custom_types.items()
+            }
+
+        return SignalSchema.deserialize(schema)
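
A hedged usage sketch for `to_partial`, assuming a `DataModel` subclass registered the usual way (the class and field names here are illustrative, mirroring the docstring's `Foo` example):

```python
from datachain.lib.data_model import DataModel
from datachain.lib.signal_schema import SignalSchema


class Foo(DataModel):  # hypothetical model, serialized as e.g. "Foo@v1"
    name: str
    value: float


schema = SignalSchema({"signal": Foo, "count": int})
partial = schema.to_partial("signal.name", "count")

# Expected shape: "signal" now points at a generated partial type
# (e.g. "FooPartial1@v1") carrying only "name"; "count" is unchanged.
print(partial.serialize())
```
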
datachain/lib/video.py CHANGED
@@ -1,11 +1,11 @@
 import posixpath
 import shutil
 import tempfile
-from typing import Optional
+from typing import Optional, Union

 from numpy import ndarray

-from datachain.lib.file import FileError, ImageFile, Video, VideoFile
+from datachain.lib.file import File, FileError, ImageFile, Video, VideoFile

 try:
     import ffmpeg
@@ -18,7 +18,7 @@ except ImportError as exc:
     ) from exc


-def video_info(file: VideoFile) -> Video:
+def video_info(file: Union[File, VideoFile]) -> Video:
     """
     Returns video file information.

@@ -28,6 +28,8 @@ def video_info(file: VideoFile) -> Video:
     Returns:
         Video: Video file information.
     """
+    file = file.as_video_file()
+
     if not (file_path := file.get_local_path()):
         file.ensure_cached()
         file_path = file.get_local_path()
@@ -170,7 +172,7 @@ def save_video_frame(
     output_file = posixpath.join(
         output, f"{video.get_file_stem()}_{frame:04d}.{format}"
     )
-    return ImageFile.upload(img, output_file)
+    return ImageFile.upload(img, output_file, catalog=video._catalog)


 def save_video_fragment(
@@ -218,6 +220,6 @@ def save_video_fragment(
         ).output(output_file_tmp).run(quiet=True)

         with open(output_file_tmp, "rb") as f:
-            return VideoFile.upload(f.read(), output_file)
+            return VideoFile.upload(f.read(), output_file, catalog=video._catalog)
     finally:
         shutil.rmtree(temp_dir)
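
The `video_info` change is the usual widen-and-coerce pattern: accept the base `File` and normalize with `as_video_file()` before any video-specific work, so rows coming straight from a listing don't need an explicit cast. A self-contained sketch of the pattern (toy classes, not DataChain's actual `File` model):

```python
from typing import Union


class File:  # toy stand-in for datachain.lib.file.File
    def __init__(self, path: str) -> None:
        self.path = path

    def as_video_file(self) -> "VideoFile":
        # Re-wrap the same underlying data as the specialized type.
        return VideoFile(self.path)


class VideoFile(File):  # toy stand-in for the specialized model
    def as_video_file(self) -> "VideoFile":
        return self  # already specialized; coercion is a no-op


def video_info(file: Union[File, VideoFile]) -> str:
    file = file.as_video_file()  # normalize up front, as in the diff
    return f"probing {file.path}"


print(video_info(File("clips/intro.mp4")))  # works without an explicit cast
```
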