datachain 0.11.0__py3-none-any.whl → 0.11.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datachain/cli/__init__.py CHANGED
@@ -215,6 +215,7 @@ def handle_show_command(args, catalog):
         columns=args.columns,
         no_collapse=args.no_collapse,
         schema=args.schema,
+        include_hidden=args.hidden,
     )
 
 
datachain/cli/commands/show.py CHANGED
@@ -1,6 +1,8 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional
 
+from datachain.lib.signal_schema import SignalSchema
+
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
@@ -14,6 +16,7 @@ def show(
     columns: Sequence[str] = (),
     no_collapse: bool = False,
     schema: bool = False,
+    include_hidden: bool = False,
 ) -> None:
     from datachain import Session
     from datachain.lib.dc import DataChain
@@ -23,6 +26,13 @@ def show(
     dataset = catalog.get_dataset(name)
     dataset_version = dataset.get_version(version or dataset.latest_version)
 
+    if include_hidden:
+        hidden_fields = []
+    else:
+        hidden_fields = SignalSchema.get_flatten_hidden_fields(
+            dataset_version.feature_schema
+        )
+
     query = (
         DatasetQuery(name=name, version=version, catalog=catalog)
         .select(*columns)
@@ -30,7 +40,8 @@ def show(
         .offset(offset)
     )
     records = query.to_db_records()
-    show_records(records, collapse_columns=not no_collapse)
+    show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
+
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
datachain/cli/parser/utils.py CHANGED
@@ -98,3 +98,9 @@ def add_show_args(parser: ArgumentParser) -> None:
         default=False,
         help="Do not collapse the columns",
     )
+    parser.add_argument(
+        "--hidden",
+        action="store_true",
+        default=False,
+        help="Show hidden fields",
+    )
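Together, the two CLI hunks above thread a new `--hidden` flag from the argument parser into the `show` command, so hidden fields stay out of dataset previews unless explicitly requested. A hedged usage sketch (the dataset name is a placeholder):

```
datachain show <dataset-name> --hidden
```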
datachain/lib/data_model.py CHANGED
@@ -26,6 +26,7 @@ class DataModel(BaseModel):
     """Pydantic model wrapper that registers model with `DataChain`."""
 
     _version: ClassVar[int] = 1
+    _hidden_fields: ClassVar[list[str]] = []
 
     @classmethod
     def __pydantic_init_subclass__(cls):
@@ -41,6 +42,11 @@ class DataModel(BaseModel):
         for val in models:
             ModelStore.register(val)
 
+    @classmethod
+    def hidden_fields(cls) -> list[str]:
+        """Returns a list of fields that should be hidden from the user."""
+        return cls._hidden_fields
+
 
 def is_chain_type(t: type) -> bool:
     """Return true if type is supported by `DataChain`."""
datachain/lib/dc.py CHANGED
@@ -23,6 +23,7 @@ import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
+from tqdm import tqdm
 
 from datachain.dataset import DatasetRecord
 from datachain.func import literal
@@ -32,7 +33,14 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, FileType, get_file_type
+from datachain.lib.file import (
+    EXPORT_FILES_MAX_THREADS,
+    ArrowRow,
+    File,
+    FileExporter,
+    FileType,
+    get_file_type,
+)
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
@@ -65,7 +73,6 @@ _T = TypeVar("_T")
 D = TypeVar("D", bound="DataChain")
 UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
 
-
 DEFAULT_PARQUET_CHUNK_SIZE = 100_000
 
 
@@ -1050,7 +1057,7 @@ class DataChain:
     def select(self, *args: str, _sys: bool = True) -> "Self":
         """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
-        if _sys:
+        if self._sys and _sys:
             new_schema = SignalSchema({"sys": Sys}) | new_schema
         columns = new_schema.db_signals()
         return self._evolve(
@@ -1093,6 +1100,7 @@ class DataChain:
         partition_by_columns: list[Column] = []
         signal_columns: list[Column] = []
         schema_fields: dict[str, DataType] = {}
+        keep_columns: list[str] = []
 
         # validate partition_by columns and add them to the schema
         for col in partition_by:
@@ -1100,10 +1108,13 @@ class DataChain:
                 col_db_name = ColumnMeta.to_db_name(col)
                 col_type = self.signals_schema.get_column_type(col_db_name)
                 column = Column(col_db_name, python_to_sql(col_type))
+                if col not in keep_columns:
+                    keep_columns.append(col)
             elif isinstance(col, Function):
                 column = col.get_column(self.signals_schema)
                 col_db_name = column.name
                 col_type = column.type.python_type
+                schema_fields[col_db_name] = col_type
             else:
                 raise DataChainColumnError(
                     col,
@@ -1113,7 +1124,6 @@ class DataChain:
                     ),
                 )
             partition_by_columns.append(column)
-            schema_fields[col_db_name] = col_type
 
         # validate signal columns and add them to the schema
         if not kwargs:
@@ -1128,9 +1138,13 @@ class DataChain:
             signal_columns.append(column)
             schema_fields[col_name] = func.get_result_type(self.signals_schema)
 
+        signal_schema = SignalSchema(schema_fields)
+        if keep_columns:
+            signal_schema |= self.signals_schema.to_partial(*keep_columns)
+
         return self._evolve(
             query=self._query.group_by(signal_columns, partition_by_columns),
-            signal_schema=SignalSchema(schema_fields),
+            signal_schema=signal_schema,
        )
 
     def mutate(self, **kwargs) -> "Self":
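The net effect of the `keep_columns` bookkeeping above is that string `partition_by` columns keep their original types in the output schema via `to_partial` (see `datachain/lib/signal_schema.py` below), instead of being re-declared as plain schema fields. A hedged sketch; the dataset and signal names are hypothetical, and `func.count` is assumed to be available as in other datachain releases:

```python
from datachain.func import count
from datachain.lib.dc import DataChain

chain = DataChain.from_dataset("docs").group_by(  # hypothetical dataset
    cnt=count(),
    partition_by="file.path",
)
# "file.path" keeps its File-derived type in chain.signals_schema,
# courtesy of signals_schema.to_partial("file.path")
```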
@@ -1225,23 +1239,37 @@ class DataChain:
     @overload
     def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
 
+    @overload
+    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+
     @overload
     def collect_flatten(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
-    def collect_flatten(self, *, row_factory=None):
+    @overload
+    def collect_flatten(
+        self,
+        *,
+        row_factory: Callable[[list[str], tuple[Any, ...]], _T],
+        include_hidden: bool,
+    ) -> Iterator[_T]: ...
+
+    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
             row_factory : A callable to convert row to a custom format.
                 It should accept two arguments: a list of column names and
                 a tuple of row values.
+            include_hidden: Whether to include hidden signals from the schema.
         """
-        db_signals = self._effective_signals_schema.db_signals()
+        db_signals = self._effective_signals_schema.db_signals(
+            include_hidden=include_hidden
+        )
         with self._query.ordered_select(*db_signals).as_iterable() as rows:
             if row_factory:
-                rows = (row_factory(db_signals, r) for r in rows)
+                rows = (row_factory(db_signals, r) for r in rows)  # type: ignore[assignment]
             yield from rows
 
     def to_columnar_data_with_names(
@@ -1275,10 +1303,23 @@ class DataChain:
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> list[_T]: ...
 
-    def results(self, *, row_factory=None):  # noqa: D102
+    @overload
+    def results(
+        self,
+        *,
+        row_factory: Callable[[list[str], tuple[Any, ...]], _T],
+        include_hidden: bool,
+    ) -> list[_T]: ...
+
+    @overload
+    def results(self, *, include_hidden: bool) -> list[tuple[Any, ...]]: ...
+
+    def results(self, *, row_factory=None, include_hidden=True):  # noqa: D102
         if row_factory is None:
-            return list(self.collect_flatten())
-        return list(self.collect_flatten(row_factory=row_factory))
+            return list(self.collect_flatten(include_hidden=include_hidden))
+        return list(
+            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+        )
 
     def to_records(self) -> list[dict[str, Any]]:
         """Convert every row to a dictionary."""
@@ -1788,21 +1829,25 @@ class DataChain:
             **fr_map,
         )
 
-    def to_pandas(self, flatten=False) -> "pd.DataFrame":
+    def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
 
         Parameters:
             flatten : Whether to use a multiindex or flatten column names.
+            include_hidden : Whether to include hidden columns.
         """
         import pandas as pd
 
-        headers, max_length = self._effective_signals_schema.get_headers_with_length()
+        headers, max_length = self._effective_signals_schema.get_headers_with_length(
+            include_hidden=include_hidden
+        )
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
         else:
             columns = pd.MultiIndex.from_tuples(map(tuple, headers))
 
-        return pd.DataFrame.from_records(self.results(), columns=columns)
+        results = self.results(include_hidden=include_hidden)
+        return pd.DataFrame.from_records(results, columns=columns)
 
     def show(
         self,
@@ -1810,6 +1855,7 @@ class DataChain:
         flatten=False,
         transpose=False,
         truncate=True,
+        include_hidden=False,
     ) -> None:
         """Show a preview of the chain results.
 
@@ -1818,11 +1864,12 @@ class DataChain:
             flatten : Whether to use a multiindex or flatten column names.
             transpose : Whether to transpose rows and columns.
             truncate : Whether or not to truncate the contents of columns.
+            include_hidden : Whether to include hidden columns.
         """
         import pandas as pd
 
         dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
-        df = dc.to_pandas(flatten)
+        df = dc.to_pandas(flatten, include_hidden=include_hidden)
 
         if df.empty:
             print("Empty result")
@@ -2498,19 +2545,25 @@ class DataChain:
         output: str,
         signal: str = "file",
         placement: FileExportPlacement = "fullpath",
-        use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
+        num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
+        anon: bool = False,
+        client_config: Optional[dict] = None,
     ) -> None:
-        """Export files from a specified signal to a directory.
+        """Export files from a specified signal to a directory. Files can be
+        exported to a local or cloud directory.
 
         Args:
             output: Path to the target directory for exporting files.
             signal: Name of the signal to export files from.
             placement: The method to use for naming exported files.
                 The possible values are: "filename", "etag", "fullpath", and "checksum".
-            use_cache: If `True`, cache the files before exporting.
             link_type: Method to use for exporting files.
                 Falls back to `'copy'` if symlinking fails.
+            num_threads : number of threads to use for exporting files.
+                By default it uses 5 threads.
+            anon: If true, we will treat cloud bucket as public one
+            client_config: Optional configuration for the destination storage client
 
         Example:
             Cross cloud transfer
@@ -2525,8 +2578,26 @@ class DataChain:
         ):
             raise ValueError("Files with the same name found")
 
-        for file in self.collect(signal):
-            file.export(output, placement, use_cache, link_type=link_type)  # type: ignore[union-attr]
+        if anon:
+            client_config = (client_config or {}) | {"anon": True}
+
+        progress_bar = tqdm(
+            desc=f"Exporting files to {output}: ",
+            unit=" files",
+            unit_scale=True,
+            unit_divisor=10,
+            total=self.count(),
+            leave=False,
+        )
+        file_exporter = FileExporter(
+            output,
+            placement,
+            self._settings.cache if self._settings else False,
+            link_type,
+            max_threads=num_threads or 1,
+            client_config=client_config,
+        )
+        file_exporter.run(self.collect(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
datachain/lib/file.py CHANGED
@@ -24,6 +24,7 @@ from pydantic import Field, field_validator
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
+from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
@@ -43,6 +44,41 @@ logger = logging.getLogger("datachain")
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 
 FileType = Literal["binary", "text", "image", "video"]
+EXPORT_FILES_MAX_THREADS = 5
+
+
+class FileExporter(NodesThreadPool):
+    """Class that does file exporting concurrently with thread pool"""
+
+    def __init__(
+        self,
+        output: str,
+        placement: ExportPlacement,
+        use_cache: bool,
+        link_type: Literal["copy", "symlink"],
+        max_threads: int = EXPORT_FILES_MAX_THREADS,
+        client_config: Optional[dict] = None,
+    ):
+        super().__init__(max_threads)
+        self.output = output
+        self.placement = placement
+        self.use_cache = use_cache
+        self.link_type = link_type
+        self.client_config = client_config
+
+    def done_task(self, done):
+        for task in done:
+            task.result()
+
+    def do_task(self, file):
+        file.export(
+            self.output,
+            self.placement,
+            self.use_cache,
+            link_type=self.link_type,
+            client_config=self.client_config,
+        )
+        self.increase_counter(1)
 
 
 class VFileError(DataChainError):
@@ -158,6 +194,7 @@ class File(DataModel):
         "last_modified": DateTime,
         "location": JSON,
     }
+    _hidden_fields: ClassVar[list[str]] = ["version", "source"]
 
     _unique_id_keys: ClassVar[list[str]] = [
         "source",
@@ -269,10 +306,10 @@ class File(DataModel):
         with self.open(mode="r") as stream:
             return stream.read()
 
-    def save(self, destination: str):
+    def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
-        client: Client = self._catalog.get_client(destination)
+        client: Client = self._catalog.get_client(destination, **(client_config or {}))
 
         if client.PREFIX == "file://" and not destination.startswith(client.PREFIX):
             destination = Path(destination).absolute().as_uri()
@@ -300,13 +337,13 @@ class File(DataModel):
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
+        client_config: Optional[dict] = None,
     ) -> None:
         """Export file to new location."""
-        if use_cache:
-            self._caching_enabled = use_cache
+        self._caching_enabled = use_cache
         dst = self.get_destination_path(output, placement)
         dst_dir = os.path.dirname(dst)
-        client: Client = self._catalog.get_client(dst_dir)
+        client: Client = self._catalog.get_client(dst_dir, **(client_config or {}))
         client.fs.makedirs(dst_dir, exist_ok=True)
 
@@ -316,7 +353,7 @@ class File(DataModel):
             if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
                 raise
 
-        self.save(dst)
+        self.save(dst, client_config=client_config)
 
     def _set_stream(
         self,
@@ -502,11 +539,11 @@ class TextFile(File):
         with self.open() as stream:
             return stream.read()
 
-    def save(self, destination: str):
+    def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
 
-        client: Client = self._catalog.get_client(destination)
+        client: Client = self._catalog.get_client(destination, **(client_config or {}))
         with client.fs.open(destination, mode="w") as f:
             f.write(self.read_text())
 
@@ -519,11 +556,11 @@ class ImageFile(File):
         fobj = super().read()
         return PilImage.open(BytesIO(fobj))
 
-    def save(self, destination: str):
+    def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
 
-        client: Client = self._catalog.get_client(destination)
+        client: Client = self._catalog.get_client(destination, **(client_config or {}))
         with client.fs.open(destination, mode="wb") as f:
             self.read().save(f)
 
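Because export may now target a different cloud than the source, `save`/`export` accept a `client_config` that is forwarded to `Catalog.get_client` for the destination. A hedged fragment; the output prefix is hypothetical:

```python
# assumption: `chain` yields File objects under the "file" signal
for file in chain.collect("file"):
    file.export(
        "s3://public-mirror/data",     # hypothetical output prefix
        placement="filename",
        client_config={"anon": True},  # forwarded to Catalog.get_client
    )
```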
datachain/lib/signal_schema.py CHANGED
@@ -91,6 +91,7 @@ class CustomType(BaseModel):
     name: str
     fields: dict[str, str]
     bases: list[tuple[str, str, Optional[str]]]
+    hidden_fields: Optional[list[str]] = None
 
     @classmethod
     def deserialize(cls, data: dict[str, Any], type_name: str) -> "CustomType":
@@ -102,6 +103,7 @@ class CustomType(BaseModel):
                 "name": type_name,
                 "fields": data,
                 "bases": [],
+                "hidden_fields": [],
             }
 
         return cls(**data)
@@ -179,6 +181,16 @@ class SignalSchema:
         )
         return SignalSchema(signals)
 
+    @staticmethod
+    def _get_bases(fr: type) -> list[tuple[str, str, Optional[str]]]:
+        bases: list[tuple[str, str, Optional[str]]] = []
+        for base in fr.__mro__:
+            model_store_name = (
+                ModelStore.get_name(base) if issubclass(base, DataModel) else None
+            )
+            bases.append((base.__name__, base.__module__, model_store_name))
+        return bases
+
     @staticmethod
     def _serialize_custom_model(
         version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
@@ -196,14 +208,15 @@ class SignalSchema:
             assert field_type
             fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)
 
-        bases: list[tuple[str, str, Optional[str]]] = []
-        for type_ in fr.__mro__:
-            model_store_name = (
-                ModelStore.get_name(type_) if issubclass(type_, DataModel) else None
-            )
-            bases.append((type_.__name__, type_.__module__, model_store_name))
+        bases = SignalSchema._get_bases(fr)
 
-        ct = CustomType(schema_version=2, name=version_name, fields=fields, bases=bases)
+        ct = CustomType(
+            schema_version=2,
+            name=version_name,
+            fields=fields,
+            bases=bases,
+            hidden_fields=getattr(fr, "_hidden_fields", []),
+        )
         custom_types[version_name] = ct.model_dump()
 
         return version_name
@@ -384,6 +397,37 @@ class SignalSchema:
 
         return SignalSchema(signals)
 
+    @staticmethod
+    def get_flatten_hidden_fields(schema):
+        custom_types = schema.get("_custom_types", {})
+        if not custom_types:
+            return []
+
+        hidden_by_types = {
+            name: schema.get("hidden_fields", [])
+            for name, schema in custom_types.items()
+        }
+
+        hidden_fields = []
+
+        def traverse(prefix, schema_info):
+            for field, field_type in schema_info.items():
+                if field == "_custom_types":
+                    continue
+
+                if field_type in custom_types:
+                    hidden_fields.extend(
+                        f"{prefix}{field}__{f}" for f in hidden_by_types[field_type]
+                    )
+                    traverse(
+                        prefix + field + "__",
+                        custom_types[field_type].get("fields", {}),
+                    )
+
+        traverse("", schema)
+
+        return hidden_fields
+
     def to_udf_spec(self) -> dict[str, type]:
         res = {}
         for path, type_, has_subtree, _ in self.get_flat_tree():
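`get_flatten_hidden_fields` walks a serialized feature schema and returns flattened DB column names (joined with `__`) for every hidden field, recursing through nested custom types. A hedged, self-contained sketch with an illustrative schema dict:

```python
from datachain.lib.signal_schema import SignalSchema

feature_schema = {
    "file": "File@v1",
    "_custom_types": {
        "File@v1": {
            "name": "File",
            "fields": {"path": "str", "source": "str", "version": "str"},
            "bases": [],
            "hidden_fields": ["version", "source"],
        }
    },
}

print(SignalSchema.get_flatten_hidden_fields(feature_schema))
# ['file__version', 'file__source']
```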
@@ -479,7 +523,7 @@ class SignalSchema:
         raise SignalResolvingError([col_name], "is not found")
 
     def db_signals(
-        self, name: Optional[str] = None, as_columns=False
+        self, name: Optional[str] = None, as_columns=False, include_hidden: bool = True
     ) -> Union[list[str], list[Column]]:
         """
         Returns DB columns as strings or Column objects with proper types
@@ -489,7 +533,9 @@ class SignalSchema:
             DEFAULT_DELIMITER.join(path)
             if not as_columns
             else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
-            for path, _type, has_subtree, _ in self.get_flat_tree()
+            for path, _type, has_subtree, _ in self.get_flat_tree(
+                include_hidden=include_hidden
+            )
             if not has_subtree
         ]
 
@@ -624,19 +670,31 @@ class SignalSchema:
             for name, val in values.items()
         }
 
-    def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
-        yield from self._get_flat_tree(self.tree, [], 0)
+    def get_flat_tree(
+        self, include_hidden: bool = True
+    ) -> Iterator[tuple[list[str], DataType, bool, int]]:
+        yield from self._get_flat_tree(self.tree, [], 0, include_hidden)
 
     def _get_flat_tree(
-        self, tree: dict, prefix: list[str], depth: int
+        self, tree: dict, prefix: list[str], depth: int, include_hidden: bool
     ) -> Iterator[tuple[list[str], DataType, bool, int]]:
         for name, (type_, substree) in tree.items():
             suffix = name.split(".")
             new_prefix = prefix + suffix
+            hidden_fields = getattr(type_, "_hidden_fields", None)
+            if hidden_fields and substree and not include_hidden:
+                substree = {
+                    field: info
+                    for field, info in substree.items()
+                    if field not in hidden_fields
+                }
+
             has_subtree = substree is not None
             yield new_prefix, type_, has_subtree, depth
             if substree is not None:
-                yield from self._get_flat_tree(substree, new_prefix, depth + 1)
+                yield from self._get_flat_tree(
+                    substree, new_prefix, depth + 1, include_hidden
+                )
 
     def print_tree(self, indent: int = 4, start_at: int = 0):
         for path, type_, _, depth in self.get_flat_tree():
@@ -649,9 +707,13 @@ class SignalSchema:
             sub_schema = SignalSchema({"* list of": args[0]})
             sub_schema.print_tree(indent=indent, start_at=total_indent + indent)
 
-    def get_headers_with_length(self):
+    def get_headers_with_length(self, include_hidden: bool = True):
         paths = [
-            path for path, _, has_subtree, _ in self.get_flat_tree() if not has_subtree
+            path
+            for path, _, has_subtree, _ in self.get_flat_tree(
+                include_hidden=include_hidden
+            )
+            if not has_subtree
         ]
         max_length = max([len(path) for path in paths], default=0)
         return [
@@ -749,3 +811,120 @@ class SignalSchema:
             res[name] = (anno, subtree)  # type: ignore[assignment]
 
         return res
+
+    def to_partial(self, *columns: str) -> "SignalSchema":
+        """
+        Convert the schema to a partial schema with only the specified columns.
+
+        E.g. if original schema is:
+
+        ```
+        signal: Foo@v1
+            name: str
+            value: float
+        count: int
+        ```
+
+        Then `to_partial("signal.name", "count")` will return a partial schema:
+
+        ```
+        signal: FooPartial@v1
+            name: str
+        count: int
+        ```
+
+        Note that partial schema will have a different name for the custom types
+        (e.g. `FooPartial@v1` instead of `Foo@v1`) to avoid conflicts
+        with the original schema.
+
+        Args:
+            *columns (str): The columns to include in the partial schema.
+
+        Returns:
+            SignalSchema: The new partial schema.
+        """
+        serialized = self.serialize()
+        custom_types = serialized.get("_custom_types", {})
+
+        schema: dict[str, Any] = {}
+        schema_custom_types: dict[str, CustomType] = {}
+
+        data_model_bases: Optional[list[tuple[str, str, Optional[str]]]] = None
+
+        signal_partials: dict[str, str] = {}
+        partial_versions: dict[str, int] = {}
+
+        def _type_name_to_partial(signal_name: str, type_name: str) -> str:
+            if "@" not in type_name:
+                return type_name
+            model_name, _ = ModelStore.parse_name_version(type_name)
+
+            if signal_name not in signal_partials:
+                partial_versions.setdefault(model_name, 0)
+                partial_versions[model_name] += 1
+                version = partial_versions[model_name]
+                signal_partials[signal_name] = f"{model_name}Partial{version}"
+
+            return signal_partials[signal_name]
+
+        for column in columns:
+            parent_type, parent_type_partial = "", ""
+            column_parts = column.split(".")
+            for i, signal in enumerate(column_parts):
+                if i == 0:
+                    if signal not in serialized:
+                        raise SignalSchemaError(
+                            f"Column {column} not found in the schema"
+                        )
+
+                    parent_type = serialized[signal]
+                    parent_type_partial = _type_name_to_partial(signal, parent_type)
+
+                    schema[signal] = parent_type_partial
+                    continue
+
+                if parent_type not in custom_types:
+                    raise SignalSchemaError(
+                        f"Custom type {parent_type} not found in the schema"
+                    )
+
+                custom_type = custom_types[parent_type]
+                signal_type = custom_type["fields"].get(signal)
+                if not signal_type:
+                    raise SignalSchemaError(
+                        f"Field {signal} not found in custom type {parent_type}"
+                    )
+
+                partial_type = _type_name_to_partial(
+                    ".".join(column_parts[: i + 1]),
+                    signal_type,
+                )
+
+                if parent_type_partial in schema_custom_types:
+                    schema_custom_types[parent_type_partial].fields[signal] = (
+                        partial_type
+                    )
+                else:
+                    if data_model_bases is None:
+                        data_model_bases = SignalSchema._get_bases(DataModel)
+
+                    partial_type_name, _ = ModelStore.parse_name_version(partial_type)
+                    schema_custom_types[parent_type_partial] = CustomType(
+                        schema_version=2,
+                        name=partial_type_name,
+                        fields={signal: partial_type},
+                        bases=[
+                            (partial_type_name, "__main__", partial_type),
+                            *data_model_bases,
+                        ],
+                    )
+
+                parent_type, parent_type_partial = signal_type, partial_type
+
+        if schema_custom_types:
+            schema["_custom_types"] = {
+                type_name: ct.model_dump()
+                for type_name, ct in schema_custom_types.items()
+            }
+
+        return SignalSchema.deserialize(schema)
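Usage follows the docstring directly; this is what `group_by` in `datachain/lib/dc.py` relies on to carry partition columns into its result schema. A hedged one-liner, reusing the docstring's `Foo` example:

```python
# assumption: `schema` is a SignalSchema with a custom Foo type, as in the
# docstring above
partial = schema.to_partial("signal.name", "count")
# partial keeps "count" as-is and narrows "signal" to a FooPartial1 type
# containing only the "name" field
```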
datachain/nodes_thread_pool.py CHANGED
@@ -57,6 +57,9 @@ class NodesThreadPool(ABC):
         self._max_threads = max_threads
         self._thread_counter = 0
         self._thread_lock = threading.Lock()
+        self.tasks = set()
+        self.canceled = False
+        self.th_pool = None
 
     def run(
         self,
@@ -64,37 +67,55 @@ class NodesThreadPool(ABC):
         progress_bar=None,
     ):
         results = []
-        with concurrent.futures.ThreadPoolExecutor(self._max_threads) as th_pool:
-            tasks = set()
+        self.th_pool = concurrent.futures.ThreadPoolExecutor(self._max_threads)
+        try:
             self._thread_counter = 0
             for chunk in chunk_gen:
-                while len(tasks) >= self._max_threads:
+                if self.canceled:
+                    break
+                while len(self.tasks) >= self._max_threads:
                     done, _ = concurrent.futures.wait(
-                        tasks, timeout=1, return_when="FIRST_COMPLETED"
+                        self.tasks, timeout=1, return_when="FIRST_COMPLETED"
                     )
                     self.done_task(done)
 
-                    tasks = tasks - done
+                    self.tasks = self.tasks - done
                     self.update_progress_bar(progress_bar)
 
-                tasks.add(th_pool.submit(self.do_task, chunk))
+                self.tasks.add(self.th_pool.submit(self.do_task, chunk))
                 self.update_progress_bar(progress_bar)
 
-            while tasks:
+            while self.tasks:
+                if self.canceled:
+                    break
                 done, _ = concurrent.futures.wait(
-                    tasks, timeout=1, return_when="FIRST_COMPLETED"
+                    self.tasks, timeout=1, return_when="FIRST_COMPLETED"
                 )
                 task_results = self.done_task(done)
                 if task_results:
                     results.extend(task_results)
 
-                tasks = tasks - done
+                self.tasks = self.tasks - done
                 self.update_progress_bar(progress_bar)
-
-            th_pool.shutdown()
+        except:
+            self.cancel_all()
+            raise
+        else:
+            self.th_pool.shutdown()
 
         return results
 
+    def cancel_all(self):
+        self.cancel = True
+        # Canceling tasks just in case any of them is scheduled to run.
+        # Note that running tasks cannot be canceled, instead we will wait for
+        # them to finish when shutting down thread loop executor by calling
+        # shutdown() method.
+        for task in self.tasks:
+            task.cancel()
+        if self.th_pool:
+            self.th_pool.shutdown()  # this will wait for running tasks to finish
+
     def update_progress_bar(self, progress_bar):
         if progress_bar is not None:
             with self._thread_lock:
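`FileExporter` in `datachain/lib/file.py` is the only subclass this diff adds, but the contract is small: implement `do_task` (runs on worker threads) and `done_task` (inspects finished futures). A minimal hedged sketch of a hypothetical subclass:

```python
from datachain.nodes_thread_pool import NodesThreadPool


class Printer(NodesThreadPool):  # hypothetical subclass
    def do_task(self, chunk):
        print(chunk)              # work for one chunk, on a worker thread
        self.increase_counter(1)  # drives update_progress_bar()

    def done_task(self, done):
        for task in done:
            task.result()         # re-raise any worker exception


Printer(max_threads=2).run(iter(["a", "b", "c"]))
```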
datachain/utils.py CHANGED
@@ -362,6 +362,7 @@ def show_records(
     records: Optional[list[dict]],
     collapse_columns: bool = False,
     system_columns: bool = False,
+    hidden_fields: Optional[list[str]] = None,
 ) -> None:
     import pandas as pd
 
@@ -369,6 +370,8 @@ def show_records(
         return
 
     df = pd.DataFrame.from_records(records)
+    if hidden_fields:
+        df = df.drop(columns=hidden_fields, errors="ignore")
     return show_df(df, collapse_columns=collapse_columns, system_columns=system_columns)
 
 
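On the rendering side, hidden columns are simply dropped from the records DataFrame before display; `errors="ignore"` makes the filter a no-op for columns that are not present. A hedged sketch:

```python
from datachain.utils import show_records

records = [{"file__path": "a.txt", "file__source": "s3://bkt"}]
show_records(records, hidden_fields=["file__source"])  # shows only file__path
```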
{datachain-0.11.0.dist-info → datachain-0.11.11.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: datachain
-Version: 0.11.0
+Version: 0.11.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
 Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
@@ -71,9 +72,8 @@ Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
 Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
 Provides-Extra: video
-Requires-Dist: av<14; extra == "video"
 Requires-Dist: ffmpeg-python; extra == "video"
-Requires-Dist: imageio[ffmpeg]; extra == "video"
+Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
 Provides-Extra: tests
 Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
@@ -103,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist: ultralytics==8.3.78; extra == "examples"
+Requires-Dist: ultralytics==8.3.82; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
 
 ================
{datachain-0.11.0.dist-info → datachain-0.11.11.dist-info}/RECORD RENAMED
@@ -9,18 +9,18 @@ datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=HNB-xeKA6aUA-HTWr--H22S6jVOxP2OVQ-3d07ISqAk,7109
 datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
 datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,1095
-datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
+datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2SmM,3989
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/studio.py,sha256=Coo_6murSjh-RypiHDWNsVXGmfsopyMPCpPS1sA6uUc,9844
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=n8fcyOM8P_2CEFK4h8BZxCAwCkOpt8NAeJK5tm1gIOg,14433
+datachain/utils.py,sha256=-vhV9LMUcUxDSBmyeJH4WJcfLTO416usD6igXS8c49k,14563
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
 datachain/catalog/catalog.py,sha256=xZC6drw4opoYcxTTiAFv6nbhNOzBb-UZZ_VqY9dqdIs,59458
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
-datachain/cli/__init__.py,sha256=Uu_ARR5-VS1srC_o2EADRjYKX1c86GK7LZCDL4ufE_w,8290
+datachain/cli/__init__.py,sha256=OLoDOYm7M23bLdMJhw3_GsJDGPl8pWYzcjpwgxEdFDs,8326
 datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
 datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
 datachain/cli/commands/datasets.py,sha256=865ui6q4UVPbL_-jk18C-lYi_bGMlh7XhfRaHbbNyhk,5796
@@ -29,11 +29,11 @@ datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR
 datachain/cli/commands/ls.py,sha256=Wb8hXyBwyhb62Zk6ZhNFPFrj2lJhdbRcnBQQkgL_qyw,5174
 datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
 datachain/cli/commands/query.py,sha256=2S7hQxialt1fkbocxi6JXZI6jS5QnFrD1aOjKgZkzfI,1471
-datachain/cli/commands/show.py,sha256=RVb_7Kjd1kzqTxRKYFvmD04LaJHOtrCc4FYMyc-ZEYw,1149
+datachain/cli/commands/show.py,sha256=d-DDw4hA3TWA2vqIS-FkEXrzqvttcTdh2QPaahtLdy0,1445
 datachain/cli/parser/__init__.py,sha256=rtjlqSsDd4LZH9WdgvluO27M4sID1wD7YkQ4cKhNXzw,15721
 datachain/cli/parser/job.py,sha256=kvQkSfieyUmvJpOK8p78UgS8sygHhQXztRlOtVcgtaU,3449
 datachain/cli/parser/studio.py,sha256=4HEE1K93WDJxMLfgqAA4mHdigpSzC7SLUx-qPF0NgYQ,3254
-datachain/cli/parser/utils.py,sha256=GEzxfPJ4i6nt6JhjvZ3PQesXl9islEV3E-N1NZGrLaA,2750
+datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -67,10 +67,10 @@ datachain/func/window.py,sha256=0MB1yjpVbwOrl_WNLZ8V3jkJz3o0XlYinpAcZQJuxiA,1688
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
-datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
+datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
-datachain/lib/file.py,sha256=Bbnb7JBiAFRD1RsZwPdvoiWFKHkl7V3haDLh672xTZg,27658
+datachain/lib/dc.py,sha256=XU4VmRjm7CR37YuEKMhtU_DGxb1a7agXoNVU5WsaLRc,97772
+datachain/lib/file.py,sha256=LwpRWsDvO3ZvUBAtS29mFotp_arfEy-HhPQ0jaL_2Rc,29006
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
@@ -79,7 +79,7 @@ datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=QxXBhrn2-D0RiFA2rdxZ7wKMxyuQ0WWHKfiFEWAA760,7710
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
-datachain/lib/signal_schema.py,sha256=ps5od6zhWtdX3Khx2fwArl2xlGkK8SKi6vCQ6QmbaR0,27404
+datachain/lib/signal_schema.py,sha256=WyVTXUsa4DVTIZRAX2-MdjOe4deat_Fufsd9n8ycrXQ,33629
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=TlvikKTFvkIKaqqSkSriOyXhQ0rwRHV2ZRs1LHZOCmo,16107
@@ -136,9 +136,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.11.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.11.0.dist-info/METADATA,sha256=ijLSRDc7IAZe6YxdX0ZRRNY2LOUlsFFib660U_upu20,11241
-datachain-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-datachain-0.11.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.11.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.11.0.dist-info/RECORD,,
+datachain-0.11.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.11.11.dist-info/METADATA,sha256=iF194pmsP-vh7ITTJG62w-VbTQbWGDckY-GJfempDBg,11267
+datachain-0.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+datachain-0.11.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.11.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.11.11.dist-info/RECORD,,