datachain 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datachain/lib/arrow.py CHANGED
@@ -7,7 +7,9 @@ import pyarrow as pa
 from pyarrow.dataset import dataset
 from tqdm import tqdm
 
+from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import File, IndexedFile
+from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
@@ -59,7 +61,13 @@ class ArrowGenerator(Generator):
             vals = list(record.values())
             if self.output_schema:
                 fields = self.output_schema.model_fields
-                vals = [self.output_schema(**dict(zip(fields, vals)))]
+                vals_dict = {}
+                for (field, field_info), val in zip(fields.items(), vals):
+                    if ModelStore.is_pydantic(field_info.annotation):
+                        vals_dict[field] = field_info.annotation(**val)  # type: ignore[misc]
+                    else:
+                        vals_dict[field] = val
+                vals = [self.output_schema(**vals_dict)]
             if self.source:
                 yield [IndexedFile(file=file, index=index), *vals]
             else:
@@ -95,15 +103,15 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
-        if field.nullable:
+        dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
+        if field.nullable and not ModelStore.is_pydantic(dtype):
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
 
     return output
 
 
-def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
 
@@ -123,7 +131,15 @@ def arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
         return str
     if pa.types.is_list(col_type):
         return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
-    if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
+    if pa.types.is_struct(col_type):
+        type_dict = {}
+        for field in col_type:
+            dtype = arrow_type_mapper(field.type, field.name)
+            if field.nullable and not ModelStore.is_pydantic(dtype):
+                dtype = Optional[dtype]  # type: ignore[assignment]
+            type_dict[field.name] = dtype
+        return dict_to_data_model(column, type_dict)
+    if pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
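A quick sketch of what the new struct branch means in practice, based only on the hunks above: a pyarrow struct column is now mapped to a generated data model (via `dict_to_data_model`) instead of a plain `dict`. Column and field names below are illustrative.

```py
import pyarrow as pa

from datachain.lib.arrow import schema_to_output

# A table schema with a nested struct column.
schema = pa.schema(
    [
        ("name", pa.string()),
        ("location", pa.struct([("lat", pa.float64()), ("lon", pa.float64())])),
    ]
)

output = schema_to_output(schema)
# "location" should now map to a dynamically created pydantic model rather
# than `dict`; plain nullable columns such as "name" stay Optional[str].
print(output["location"])
```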
datachain/lib/dataset_info.py CHANGED
@@ -23,6 +23,8 @@ class DatasetInfo(DataModel):
     size: Optional[int] = Field(default=None)
     params: dict[str, str] = Field(default=dict)
     metrics: dict[str, Any] = Field(default=dict)
+    error_message: str = Field(default="")
+    error_stack: str = Field(default="")
 
     @staticmethod
     def _validate_dict(
@@ -67,4 +69,6 @@ class DatasetInfo(DataModel):
             size=version.size,
             params=job.params if job else {},
             metrics=job.metrics if job else {},
+            error_message=version.error_message,
+            error_stack=version.error_stack,
         )
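An illustrative read of the new fields, assuming they surface through the dataset-listing chain (`DataChain.datasets()`, a method name that is not visible in this hunk):

```py
from datachain import DataChain

# Each DatasetInfo row now carries error details of the underlying dataset
# version; both fields are empty strings when no error was recorded.
for ds in DataChain.datasets().collect("dataset"):
    print(ds.name, ds.error_message or "<no error>")
```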
datachain/lib/dc.py CHANGED
@@ -27,7 +27,16 @@ from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.file import File, IndexedFile, get_file
+from datachain.lib.file import File, IndexedFile, get_file_type
+from datachain.lib.listing import (
+    is_listing_dataset,
+    is_listing_expired,
+    is_listing_subset,
+    list_bucket,
+    ls,
+    parse_listing_uri,
+)
+from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
@@ -311,7 +320,7 @@ class DataChain(DatasetQuery):
     @classmethod
     def from_storage(
         cls,
-        path,
+        uri,
         *,
         type: Literal["binary", "text", "image"] = "binary",
         session: Optional[Session] = None,
@@ -320,41 +329,73 @@
         recursive: Optional[bool] = True,
         object_name: str = "file",
         update: bool = False,
-        **kwargs,
+        anon: bool = False,
     ) -> "Self":
         """Get data from a storage as a list of file with all file attributes.
         It returns the chain itself as usual.
 
         Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
+            uri : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
             type : read file as "binary", "text", or "image" data. Default is "binary".
             recursive : search recursively for the given path.
             object_name : Created object column name.
             update : force storage reindexing. Default is False.
+            anon : If True, we will treat cloud bucket as public one
 
         Example:
            ```py
            chain = DataChain.from_storage("s3://my-bucket/my-dir")
            ```
         """
-        func = get_file(type)
-        return (
-            cls(
-                path,
-                session=session,
-                settings=settings,
-                recursive=recursive,
-                update=update,
-                in_memory=in_memory,
-                **kwargs,
-            )
-            .map(**{object_name: func})
-            .select(object_name)
+        file_type = get_file_type(type)
+
+        client_config = {"anon": True} if anon else None
+
+        session = Session.get(session, client_config=client_config, in_memory=in_memory)
+
+        list_dataset_name, list_uri, list_path = parse_listing_uri(
+            uri, session.catalog.cache, session.catalog.client_config
         )
+        need_listing = True
+
+        for ds in cls.listings(session=session, in_memory=in_memory).collect("listing"):
+            if (
+                not is_listing_expired(ds.created_at)  # type: ignore[union-attr]
+                and is_listing_subset(ds.name, list_dataset_name)  # type: ignore[union-attr]
+                and not update
+            ):
+                need_listing = False
+                list_dataset_name = ds.name  # type: ignore[union-attr]
+
+        if need_listing:
+            # caching new listing to special listing dataset
+            (
+                cls.from_records(
+                    DataChain.DEFAULT_FILE_RECORD,
+                    session=session,
+                    settings=settings,
+                    in_memory=in_memory,
+                )
+                .gen(
+                    list_bucket(list_uri, client_config=session.catalog.client_config),
+                    output={f"{object_name}": File},
+                )
+                .save(list_dataset_name, listing=True)
+            )
+
+        dc = cls.from_dataset(list_dataset_name, session=session)
+        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+
+        return ls(dc, list_path, recursive=recursive, object_name=object_name)
 
     @classmethod
-    def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
+    def from_dataset(
+        cls,
+        name: str,
+        version: Optional[int] = None,
+        session: Optional[Session] = None,
+    ) -> "DataChain":
         """Get data from a saved Dataset. It returns the chain itself.
 
         Parameters:
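A usage sketch of the reworked `from_storage` (the bucket URI is illustrative): the first call materializes a `lst__...` listing dataset via `list_bucket`, and later calls over the same prefix reuse it while the listing is still fresh, unless `update=True`.

```py
from datachain import DataChain

# Listing is computed once and cached as a special listing dataset.
dogs = DataChain.from_storage("s3://my-bucket/animals/dogs/", anon=True)

# A subsequent call over the same prefix (within LISTING_TTL) reuses the
# cached listing instead of re-scanning the bucket.
dogs_again = DataChain.from_storage("s3://my-bucket/animals/dogs/", anon=True)

# Force a fresh listing.
dogs_fresh = DataChain.from_storage("s3://my-bucket/animals/dogs/", update=True, anon=True)
```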
@@ -366,7 +407,7 @@ class DataChain(DatasetQuery):
            chain = DataChain.from_dataset("my_cats")
            ```
         """
-        return DataChain(name=name, version=version)
+        return DataChain(name=name, version=version, session=session)
 
     @classmethod
     def from_json(
@@ -419,7 +460,7 @@ class DataChain(DatasetQuery):
         object_name = jmespath_to_name(jmespath)
         if not object_name:
             object_name = meta_type
-        chain = DataChain.from_storage(path=path, type=type, **kwargs)
+        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
@@ -479,7 +520,7 @@ class DataChain(DatasetQuery):
         object_name = jmespath_to_name(jmespath)
         if not object_name:
             object_name = meta_type
-        chain = DataChain.from_storage(path=path, type=type, **kwargs)
+        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
@@ -500,6 +541,7 @@ class DataChain(DatasetQuery):
         settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "dataset",
+        include_listing: bool = False,
     ) -> "DataChain":
         """Generate chain with list of registered datasets.
 
@@ -517,7 +559,9 @@ class DataChain(DatasetQuery):
 
         datasets = [
             DatasetInfo.from_models(d, v, j)
-            for d, v, j in catalog.list_datasets_versions()
+            for d, v, j in catalog.list_datasets_versions(
+                include_listing=include_listing
+            )
         ]
 
         return cls.from_values(
@@ -528,6 +572,42 @@ class DataChain(DatasetQuery):
             **{object_name: datasets},  # type: ignore[arg-type]
         )
 
+    @classmethod
+    def listings(
+        cls,
+        session: Optional[Session] = None,
+        in_memory: bool = False,
+        object_name: str = "listing",
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain with list of cached listings.
+        Listing is a special kind of dataset which has directory listing data of
+        some underlying storage (e.g S3 bucket).
+
+        Example:
+            ```py
+            from datachain import DataChain
+            DataChain.listings().show()
+            ```
+        """
+        session = Session.get(session, in_memory=in_memory)
+        catalog = kwargs.get("catalog") or session.catalog
+
+        listings = [
+            ListingInfo.from_models(d, v, j)
+            for d, v, j in catalog.list_datasets_versions(
+                include_listing=True, **kwargs
+            )
+            if is_listing_dataset(d.name)
+        ]
+
+        return cls.from_values(
+            session=session,
+            in_memory=in_memory,
+            output={object_name: ListingInfo},
+            **{object_name: listings},  # type: ignore[arg-type]
+        )
+
     def print_json_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
     ) -> "Self":
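The new `listings()` chain exposes cached listings as `ListingInfo` rows (see the new model further below), which can be inspected directly:

```py
from datachain import DataChain

# Iterate over cached listing datasets and check which ones are still fresh.
for info in DataChain.listings().collect("listing"):
    print(info.uri, info.is_expired)
```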
@@ -570,7 +650,7 @@ class DataChain(DatasetQuery):
         )
 
     def save(  # type: ignore[override]
-        self, name: Optional[str] = None, version: Optional[int] = None
+        self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
         """Save to a Dataset. It returns the chain itself.
 
@@ -580,7 +660,7 @@ class DataChain(DatasetQuery):
             version : version of a dataset. Default - the last version that exist.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return super().save(name=name, version=version, feature_schema=schema)
+        return super().save(name=name, version=version, feature_schema=schema, **kwargs)
 
     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
@@ -1665,7 +1745,10 @@ class DataChain(DatasetQuery):
 
         if schema:
             signal_schema = SignalSchema(schema)
-            columns = signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+            columns = [
+                sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
+                for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+            ]
         else:
             columns = [
                 sqlalchemy.Column(name, typ)
datachain/lib/file.py CHANGED
@@ -349,39 +349,6 @@ class ImageFile(File):
         self.read().save(destination)
 
 
-def get_file(type_: Literal["binary", "text", "image"] = "binary"):
-    file: type[File] = File
-    if type_ == "text":
-        file = TextFile
-    elif type_ == "image":
-        file = ImageFile  # type: ignore[assignment]
-
-    def get_file_type(
-        source: str,
-        path: str,
-        size: int,
-        version: str,
-        etag: str,
-        is_latest: bool,
-        last_modified: datetime,
-        location: Optional[Union[dict, list[dict]]],
-        vtype: str,
-    ) -> file:  # type: ignore[valid-type]
-        return file(
-            source=source,
-            path=path,
-            size=size,
-            version=version,
-            etag=etag,
-            is_latest=is_latest,
-            last_modified=last_modified,
-            location=location,
-            vtype=vtype,
-        )
-
-    return get_file_type
-
-
 class IndexedFile(DataModel):
     """Metadata indexed from tabular files.
 
@@ -390,3 +357,13 @@ class IndexedFile(DataModel):
 
     file: File
     index: int
+
+
+def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+    file: type[File] = File
+    if type_ == "text":
+        file = TextFile
+    elif type_ == "image":
+        file = ImageFile  # type: ignore[assignment]
+
+    return file
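The old closure-based `get_file` is replaced by a plain type lookup; a minimal check of the new helper:

```py
from datachain.lib.file import File, ImageFile, TextFile, get_file_type

# get_file_type now returns the File subclass itself rather than a
# row-building closure; rows are converted to this type downstream via
# SignalSchema.mutate in from_storage.
assert get_file_type() is File
assert get_file_type("text") is TextFile
assert get_file_type("image") is ImageFile
```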
datachain/lib/hf.py CHANGED
@@ -99,7 +99,8 @@ class HFGenerator(Generator):
 
 def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     if isinstance(ds, str):
-        ds = load_dataset(ds, *args, streaming=True, **kwargs)
+        kwargs["streaming"] = True
+        ds = load_dataset(ds, *args, **kwargs)
     if isinstance(ds, (DatasetDict, IterableDatasetDict)):
         return ds
     return {"": ds}
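With the change above, a caller that already passes `streaming=True` no longer hits a duplicate-keyword `TypeError`; the flag is simply forced on. A small sketch (requires the `datasets` package and network access; the dataset name is illustrative):

```py
from datachain.lib.hf import stream_splits

# Previously load_dataset(ds, *args, streaming=True, **kwargs) would raise
# "got multiple values for keyword argument 'streaming'" for this call.
splits = stream_splits("mnist", streaming=True)
print(list(splits))
```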
datachain/lib/listing.py CHANGED
@@ -1,103 +1,26 @@
-import asyncio
-from collections.abc import AsyncIterator, Iterator, Sequence
-from typing import Callable, Optional
+import posixpath
+from collections.abc import Iterator
+from datetime import datetime, timedelta, timezone
+from typing import TYPE_CHECKING, Callable, Optional
 
-from botocore.exceptions import ClientError
 from fsspec.asyn import get_loop
+from sqlalchemy.sql.expression import true
 
 from datachain.asyn import iter_over_async
 from datachain.client import Client
-from datachain.error import ClientError as DataChainClientError
 from datachain.lib.file import File
+from datachain.query.schema import Column
+from datachain.sql.functions import path as pathfunc
+from datachain.utils import uses_glob
 
-ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
-
-DELIMITER = "/"  # Path delimiter
-FETCH_WORKERS = 100
-
-
-async def _fetch_dir(client, prefix, result_queue) -> set[str]:
-    path = f"{client.name}/{prefix}"
-    infos = await client.ls_dir(path)
-    files = []
-    subdirs = set()
-    for info in infos:
-        full_path = info["name"]
-        subprefix = client.rel_path(full_path)
-        if prefix.strip(DELIMITER) == subprefix.strip(DELIMITER):
-            continue
-        if info["type"] == "directory":
-            subdirs.add(subprefix)
-        else:
-            files.append(client.info_to_file(info, subprefix))
-    if files:
-        await result_queue.put(files)
-    return subdirs
-
-
-async def _fetch(
-    client, start_prefix: str, result_queue: ResultQueue, fetch_workers
-) -> None:
-    loop = get_loop()
-
-    queue: asyncio.Queue[str] = asyncio.Queue()
-    queue.put_nowait(start_prefix)
-
-    async def process(queue) -> None:
-        while True:
-            prefix = await queue.get()
-            try:
-                subdirs = await _fetch_dir(client, prefix, result_queue)
-                for subdir in subdirs:
-                    queue.put_nowait(subdir)
-            except Exception:
-                while not queue.empty():
-                    queue.get_nowait()
-                    queue.task_done()
-                raise
-
-            finally:
-                queue.task_done()
-
-    try:
-        workers: list[asyncio.Task] = [
-            loop.create_task(process(queue)) for _ in range(fetch_workers)
-        ]
-
-        # Wait for all fetch tasks to complete
-        await queue.join()
-        # Stop the workers
-        excs = []
-        for worker in workers:
-            if worker.done() and (exc := worker.exception()):
-                excs.append(exc)
-            else:
-                worker.cancel()
-        if excs:
-            raise excs[0]
-    except ClientError as exc:
-        raise DataChainClientError(
-            exc.response.get("Error", {}).get("Message") or exc,
-            exc.response.get("Error", {}).get("Code"),
-        ) from exc
-    finally:
-        # This ensures the progress bar is closed before any exceptions are raised
-        result_queue.put_nowait(None)
-
-
-async def _scandir(client, prefix, fetch_workers) -> AsyncIterator:
-    """Recursively goes through dir tree and yields files"""
-    result_queue: ResultQueue = asyncio.Queue()
-    loop = get_loop()
-    main_task = loop.create_task(_fetch(client, prefix, result_queue, fetch_workers))
-    while (files := await result_queue.get()) is not None:
-        for f in files:
-            yield f
-
-    await main_task
-
-
-def list_bucket(uri: str, client_config=None, fetch_workers=FETCH_WORKERS) -> Callable:
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
+LISTING_PREFIX = "lst__"  # listing datasets start with this name
+
+
+def list_bucket(uri: str, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
     from bucket where each File represents one bucket entry.
@@ -106,6 +29,91 @@ def list_bucket(uri: str, client_config=None, fetch_workers=FETCH_WORKERS) -> Ca
     def list_func() -> Iterator[File]:
         config = client_config or {}
         client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
-        yield from iter_over_async(_scandir(client, path, fetch_workers), get_loop())
+        for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
+            for entry in entries:
+                yield entry.to_file(client.uri)
 
     return list_func
+
+
+def ls(
+    dc: "DataChain",
+    path: str,
+    recursive: Optional[bool] = True,
+    object_name="file",
+):
+    """
+    Return files by some path from DataChain instance which contains bucket listing.
+    Path can have globs.
+    If recursive is set to False, only first level children will be returned by
+    specified path
+    """
+
+    def _file_c(name: str) -> Column:
+        return Column(f"{object_name}.{name}")
+
+    dc = dc.filter(_file_c("is_latest") == true())
+
+    if recursive:
+        if not path or path == "/":
+            # root of a bucket, returning all latest files from it
+            return dc
+
+        if not uses_glob(path):
+            # path is not glob, so it's pointing to some directory or a specific
+            # file and we are adding proper filter for it
+            return dc.filter(
+                (_file_c("path") == path)
+                | (_file_c("path").glob(path.rstrip("/") + "/*"))
+            )
+
+        # path has glob syntax so we are returning glob filter
+        return dc.filter(_file_c("path").glob(path))
+    # returning only first level children by path
+    return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
+
+
+def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
+    """
+    Parsing uri and returns listing dataset name, listing uri and listing path
+    """
+    client, path = Client.parse_url(uri, cache, **client_config)
+
+    # clean path without globs
+    lst_uri_path = (
+        posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
+    )
+
+    lst_uri = f"{client.uri}/{lst_uri_path.lstrip('/')}"
+    ds_name = (
+        f"{LISTING_PREFIX}{client.uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
+    )
+
+    return ds_name, lst_uri, path
+
+
+def is_listing_dataset(name: str) -> bool:
+    """Returns True if it's special listing dataset"""
+    return name.startswith(LISTING_PREFIX)
+
+
+def listing_uri_from_name(dataset_name: str) -> str:
+    """Returns clean storage URI from listing dataset name"""
+    if not is_listing_dataset(dataset_name):
+        raise ValueError(f"Dataset {dataset_name} is not a listing")
+    return dataset_name.removeprefix(LISTING_PREFIX)
+
+
+def is_listing_expired(created_at: datetime) -> bool:
+    """Checks if listing has expired based on it's creation date"""
+    return datetime.now(timezone.utc) > created_at + timedelta(seconds=LISTING_TTL)
+
+
+def is_listing_subset(ds1_name: str, ds2_name: str) -> bool:
+    """
+    Checks if one listing contains another one by comparing corresponding dataset names
+    """
+    assert ds1_name.endswith("/")
+    assert ds2_name.endswith("/")
+
+    return ds2_name.startswith(ds1_name)
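A few of the new helpers in action (dataset and bucket names below are illustrative):

```py
from datachain.lib.listing import (
    is_listing_dataset,
    is_listing_subset,
    listing_uri_from_name,
)

name = "lst__s3://my-bucket/animals/"
assert is_listing_dataset(name)
assert listing_uri_from_name(name) == "s3://my-bucket/animals/"

# A listing of the whole "animals/" prefix covers the "animals/dogs/" sub-prefix,
# which is how from_storage decides it can reuse a cached listing.
assert is_listing_subset("lst__s3://my-bucket/animals/", "lst__s3://my-bucket/animals/dogs/")
```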
datachain/lib/listing_info.py ADDED
@@ -0,0 +1,32 @@
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+from datachain.client import Client
+from datachain.lib.dataset_info import DatasetInfo
+from datachain.lib.listing import LISTING_PREFIX, LISTING_TTL
+
+
+class ListingInfo(DatasetInfo):
+    @property
+    def uri(self) -> str:
+        return self.name.removeprefix(LISTING_PREFIX)
+
+    @property
+    def storage_uri(self) -> str:
+        client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
+        return client.uri
+
+    @property
+    def expires(self) -> Optional[datetime]:
+        if not self.finished_at:
+            return None
+        return self.finished_at + timedelta(seconds=LISTING_TTL)
+
+    @property
+    def is_expired(self) -> bool:
+        return datetime.now(timezone.utc) > self.expires if self.expires else False
+
+    @property
+    def last_inserted_at(self):
+        # TODO we need to add updated_at to dataset version or explicit last_inserted_at
+        raise NotImplementedError
datachain/lib/meta_formats.py CHANGED
@@ -54,10 +54,10 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
             if data_type == "csv":
-                data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
-                data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+                data_string += fd.readline().replace("\r", "")
+                data_string += fd.readline().replace("\r", "")
             elif data_type == "jsonl":
-                data_string = fd.readline().decode("utf-8", "ignore").replace("\r", "")
+                data_string = fd.readline().replace("\r", "")
             else:
                 data_string = fd.read()  # other meta must fit into RAM
     except OSError as e:
@@ -120,7 +120,7 @@ def read_meta( # noqa: C901
    sys.stdout = captured_output
    try:
        chain = (
-            DataChain.from_storage(schema_from)
+            DataChain.from_storage(schema_from, type="text")
            .limit(1)
            .map(  # dummy column created (#1615)
                meta_schema=lambda file: read_schema(
datachain/lib/signal_schema.py CHANGED
@@ -16,7 +16,6 @@ from typing import (
     get_origin,
 )
 
-import sqlalchemy as sa
 from pydantic import BaseModel, create_model
 from typing_extensions import Literal as LiteralEx
 
@@ -341,7 +340,7 @@ class SignalSchema:
        signals = [
            DEFAULT_DELIMITER.join(path)
            if not as_columns
-            else sa.Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
            for path, _type, has_subtree, _ in self.get_flat_tree()
            if not has_subtree
        ]
@@ -415,6 +414,10 @@ class SignalSchema:
                # renaming existing signal
                del new_values[value.name]
                new_values[name] = self.values[value.name]
+            elif name in self.values:
+                # changing the type of existing signal, e.g File -> ImageFile
+                del new_values[name]
+                new_values[name] = args_map[name]
            else:
                # adding new signal
                new_values.update(sql_to_python({name: value}))
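A sketch of the behavior added by the new `elif` branch, assuming `SignalSchema` can be built directly from a name-to-type mapping (as the `SignalSchema(schema)` usage in the dc.py hunk suggests) and that the module path is `datachain.lib.signal_schema`: mutating an existing signal to a subtype now replaces its type in place, which is what `from_storage` relies on when swapping `File` for `ImageFile` or `TextFile`.

```py
from datachain.lib.file import File, ImageFile
from datachain.lib.signal_schema import SignalSchema

schema = SignalSchema({"file": File})

# Before this change the "file" entry fell through to the "adding new signal"
# branch; now its type is replaced in place.
mutated = schema.mutate({"file": ImageFile})
print(mutated.values["file"])  # expected: ImageFile
```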