datachain 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of datachain might be problematic.
- datachain/asyn.py +20 -0
- datachain/catalog/catalog.py +12 -1
- datachain/catalog/loader.py +75 -50
- datachain/client/azure.py +13 -0
- datachain/client/gcs.py +12 -0
- datachain/client/local.py +11 -0
- datachain/client/s3.py +12 -0
- datachain/data_storage/schema.py +22 -8
- datachain/data_storage/sqlite.py +60 -14
- datachain/data_storage/warehouse.py +17 -3
- datachain/lib/arrow.py +1 -1
- datachain/lib/convert/values_to_tuples.py +14 -8
- datachain/lib/data_model.py +1 -0
- datachain/lib/dc.py +52 -19
- datachain/lib/listing.py +111 -0
- datachain/lib/meta_formats.py +8 -2
- datachain/node.py +1 -1
- datachain/query/dataset.py +22 -12
- datachain/query/schema.py +4 -0
- datachain/query/session.py +9 -2
- datachain/sql/default/base.py +3 -0
- datachain/sql/sqlite/base.py +33 -4
- datachain/sql/types.py +120 -11
- {datachain-0.3.1.dist-info → datachain-0.3.3.dist-info}/METADATA +75 -87
- {datachain-0.3.1.dist-info → datachain-0.3.3.dist-info}/RECORD +29 -28
- {datachain-0.3.1.dist-info → datachain-0.3.3.dist-info}/WHEEL +1 -1
- {datachain-0.3.1.dist-info → datachain-0.3.3.dist-info}/LICENSE +0 -0
- {datachain-0.3.1.dist-info → datachain-0.3.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.1.dist-info → datachain-0.3.3.dist-info}/top_level.txt +0 -0
datachain/asyn.py
CHANGED
@@ -224,3 +224,23 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
     async def _break_iteration(self) -> None:
         self.heap = []
         self._push_result(self._next_yield, None)
+
+
+def iter_over_async(ait, loop):
+    """Wrap an asynchronous iterator into a synchronous one"""
+    ait = ait.__aiter__()
+
+    # helper async fn that just gets the next element from the async iterator
+    async def get_next():
+        try:
+            obj = await ait.__anext__()
+            return False, obj
+        except StopAsyncIteration:
+            return True, None
+
+    # actual sync iterator
+    while True:
+        done, obj = asyncio.run_coroutine_threadsafe(get_next(), loop).result()
+        if done:
+            break
+        yield obj

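The new iter_over_async helper lets synchronous code consume an async iterator by submitting coroutines to an event loop running elsewhere. A minimal usage sketch (the background-thread loop setup below is an assumption for illustration, not part of the diff):

import asyncio
import threading

from datachain.asyn import iter_over_async

# Run an event loop in a background thread so run_coroutine_threadsafe()
# inside iter_over_async has a live loop to submit coroutines to.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def numbers():
    for i in range(3):
        await asyncio.sleep(0)
        yield i

# Consume the async generator from plain synchronous code.
assert list(iter_over_async(numbers(), loop)) == [0, 1, 2]
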
datachain/catalog/catalog.py
CHANGED
@@ -577,6 +577,7 @@ class Catalog:
         warehouse_ready_callback: Optional[
             Callable[["AbstractWarehouse"], None]
         ] = None,
+        in_memory: bool = False,
     ):
         datachain_dir = DataChainDir(cache=cache_dir, tmp=tmp_dir)
         datachain_dir.init()
@@ -590,6 +591,7 @@ class Catalog:
             "tmp_dir": tmp_dir,
         }
         self._warehouse_ready_callback = warehouse_ready_callback
+        self.in_memory = in_memory
 
     @cached_property
     def warehouse(self) -> "AbstractWarehouse":
@@ -1627,8 +1629,17 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
         file_signals_values = {}
+        file_schemas = {}
+        # TODO: To remove after we properly fix deserialization
+        for signal, type_name in version.feature_schema.items():
+            from datachain.lib.model_store import ModelStore
 
-
+            type_name_parsed, v = ModelStore.parse_name_version(type_name)
+            fr = ModelStore.get(type_name_parsed, v)
+            if fr and issubclass(fr, File):
+                file_schemas[signal] = type_name
+
+        schema = SignalSchema.deserialize(file_schemas)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {

datachain/catalog/loader.py
CHANGED
@@ -28,8 +28,10 @@ WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
 DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
 
+IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
 
-
+
+def get_id_generator(in_memory: bool = False) -> "AbstractIDGenerator":
     id_generator_serialized = os.environ.get(ID_GENERATOR_SERIALIZED)
     if id_generator_serialized:
         id_generator_obj = deserialize(id_generator_serialized)
@@ -43,25 +45,31 @@ def get_id_generator() -> "AbstractIDGenerator":
     id_generator_import_path = os.environ.get(ID_GENERATOR_IMPORT_PATH)
     id_generator_arg_envs = get_envs_by_prefix(ID_GENERATOR_ARG_PREFIX)
     # Convert env variable names to keyword argument names by lowercasing them
-    id_generator_args
+    id_generator_args: dict[str, Any] = {
+        k.lower(): v for k, v in id_generator_arg_envs.items()
+    }
+
+    if not id_generator_import_path:
+        id_generator_args["in_memory"] = in_memory
+        return SQLiteIDGenerator(**id_generator_args)
+    if in_memory:
+        raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
+    # ID generator paths are specified as (for example):
+    # datachain.data_storage.SQLiteIDGenerator
+    if "." not in id_generator_import_path:
+        raise RuntimeError(
+            f"Invalid {ID_GENERATOR_IMPORT_PATH} import path:"
+            f"{id_generator_import_path}"
+        )
+    module_name, _, class_name = id_generator_import_path.rpartition(".")
+    id_generator = import_module(module_name)
+    id_generator_class = getattr(id_generator, class_name)
     return id_generator_class(**id_generator_args)
 
 
-def get_metastore(
+def get_metastore(
+    id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
+) -> "AbstractMetastore":
     metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
     if metastore_serialized:
         metastore_obj = deserialize(metastore_serialized)
@@ -78,24 +86,32 @@ def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMet
     metastore_import_path = os.environ.get(METASTORE_IMPORT_PATH)
     metastore_arg_envs = get_envs_by_prefix(METASTORE_ARG_PREFIX)
     # Convert env variable names to keyword argument names by lowercasing them
-    metastore_args
-    if
-    raise
+    metastore_args: dict[str, Any] = {
+        k.lower(): v for k, v in metastore_arg_envs.items()
+    }
+
+    if not metastore_import_path:
+        if not isinstance(id_generator, SQLiteIDGenerator):
+            raise ValueError("SQLiteMetastore can only be used with SQLiteIDGenerator")
+        metastore_args["in_memory"] = in_memory
+        return SQLiteMetastore(id_generator, **metastore_args)
+    if in_memory:
+        raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
+    # Metastore paths are specified as (for example):
+    # datachain.data_storage.SQLiteMetastore
+    if "." not in metastore_import_path:
+        raise RuntimeError(
+            f"Invalid {METASTORE_IMPORT_PATH} import path: {metastore_import_path}"
+        )
+    module_name, _, class_name = metastore_import_path.rpartition(".")
+    metastore = import_module(module_name)
+    metastore_class = getattr(metastore, class_name)
     return metastore_class(id_generator, **metastore_args)
 
 
-def get_warehouse(
+def get_warehouse(
+    id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
+) -> "AbstractWarehouse":
     warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
     if warehouse_serialized:
         warehouse_obj = deserialize(warehouse_serialized)
@@ -112,20 +128,26 @@ def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWar
     warehouse_import_path = os.environ.get(WAREHOUSE_IMPORT_PATH)
     warehouse_arg_envs = get_envs_by_prefix(WAREHOUSE_ARG_PREFIX)
     # Convert env variable names to keyword argument names by lowercasing them
-    warehouse_args
-    if
-    raise
+    warehouse_args: dict[str, Any] = {
+        k.lower(): v for k, v in warehouse_arg_envs.items()
+    }
+
+    if not warehouse_import_path:
+        if not isinstance(id_generator, SQLiteIDGenerator):
+            raise ValueError("SQLiteWarehouse can only be used with SQLiteIDGenerator")
+        warehouse_args["in_memory"] = in_memory
+        return SQLiteWarehouse(id_generator, **warehouse_args)
+    if in_memory:
+        raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
+    # Warehouse paths are specified as (for example):
+    # datachain.data_storage.SQLiteWarehouse
+    if "." not in warehouse_import_path:
+        raise RuntimeError(
+            f"Invalid {WAREHOUSE_IMPORT_PATH} import path: {warehouse_import_path}"
+        )
+    module_name, _, class_name = warehouse_import_path.rpartition(".")
+    warehouse = import_module(module_name)
+    warehouse_class = getattr(warehouse, class_name)
     return warehouse_class(id_generator, **warehouse_args)
 
 
@@ -152,7 +174,9 @@ def get_distributed_class(**kwargs):
     return distributed_class(**distributed_args | kwargs)
 
 
-def get_catalog(
+def get_catalog(
+    client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
+) -> Catalog:
     """
     Function that creates Catalog instance with appropriate metastore
     and warehouse classes. Metastore class can be provided with env variable
@@ -164,10 +188,11 @@ def get_catalog(client_config: Optional[dict[str, Any]] = None) -> Catalog:
     and name of variable after, e.g. if it accepts team_id as kwargs
     we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
     """
-    id_generator = get_id_generator()
+    id_generator = get_id_generator(in_memory=in_memory)
     return Catalog(
         id_generator=id_generator,
-        metastore=get_metastore(id_generator),
-        warehouse=get_warehouse(id_generator),
+        metastore=get_metastore(id_generator, in_memory=in_memory),
+        warehouse=get_warehouse(id_generator, in_memory=in_memory),
         client_config=client_config,
+        in_memory=in_memory,
    )

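With these changes a fully in-memory catalog can be requested in one call. A short sketch based on the signatures above (SQLite backends only; any custom import path combined with in_memory raises the RuntimeError defined at the top of the file):

from datachain.catalog.loader import get_catalog

# All three backends (ID generator, metastore, warehouse) are created as
# in-memory SQLite databases; nothing is written to disk.
catalog = get_catalog(in_memory=True)

# The flag is also stored on the Catalog itself (see catalog.py above).
assert catalog.in_memory is True
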
datachain/client/azure.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Any
 from adlfs import AzureBlobFileSystem
 from tqdm import tqdm
 
+from datachain.lib.file import File
 from datachain.node import Entry
 
 from .fsspec import DELIMITER, Client, ResultQueue
@@ -24,6 +25,18 @@ class AzureClient(Client):
             size=v.get("size", ""),
         )
 
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        version_id = v.get("version_id")
+        return File(
+            source=self.uri,
+            path=path,
+            etag=v.get("etag", "").strip('"'),
+            version=version_id or "",
+            is_latest=version_id is None or bool(v.get("is_current_version")),
+            last_modified=v["last_modified"],
+            size=v.get("size", ""),
+        )
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         prefix = start_prefix
         if prefix:

datachain/client/gcs.py
CHANGED
@@ -9,6 +9,7 @@ from dateutil.parser import isoparse
 from gcsfs import GCSFileSystem
 from tqdm import tqdm
 
+from datachain.lib.file import File
 from datachain.node import Entry
 
 from .fsspec import DELIMITER, Client, ResultQueue
@@ -120,3 +121,14 @@ class GCSClient(Client):
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            source=self.uri,
+            path=path,
+            etag=v.get("etag", ""),
+            version=v.get("generation", ""),
+            is_latest=not v.get("timeDeleted"),
+            last_modified=self.parse_timestamp(v["updated"]),
+            size=v.get("size", ""),
+        )

datachain/client/local.py
CHANGED
@@ -7,6 +7,7 @@ from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
 
+from datachain.lib.file import File
 from datachain.node import Entry
 from datachain.storage import StorageURI
 
@@ -144,6 +145,16 @@ class FileClient(Client):
             size=v.get("size", ""),
         )
 
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", ""),
+            etag=v["mtime"].hex(),
+            is_latest=True,
+            last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
+        )
+
     def fetch_nodes(
         self,
         nodes,

datachain/client/s3.py
CHANGED
@@ -5,6 +5,7 @@ from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
 from tqdm import tqdm
 
+from datachain.lib.file import File
 from datachain.node import Entry
 
 from .fsspec import DELIMITER, Client, ResultQueue
@@ -167,3 +168,14 @@ class ClientS3(Client):
             owner_name=v.get("Owner", {}).get("DisplayName", ""),
             owner_id=v.get("Owner", {}).get("ID", ""),
         )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            source=self.uri,
+            path=path,
+            size=v["size"],
+            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
+            etag=v.get("ETag", "").strip('"'),
+            is_latest=v.get("IsLatest", True),
+            last_modified=v.get("LastModified", ""),
+        )

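Each client now maps a raw fsspec info dict to a datachain File object in addition to an Entry. A hedged sketch of the File that ClientS3.info_to_file() builds from one listing entry (the dict values are made up, and clean_s3_version() is skipped here for brevity):

from datetime import datetime, timezone

from datachain.lib.file import File

info = {  # illustrative values only
    "size": 1024,
    "ETag": '"d41d8cd98f00b204e9800998ecf8427e"',
    "IsLatest": True,
    "LastModified": datetime(2024, 8, 1, tzinfo=timezone.utc),
}
file = File(
    source="s3://example-bucket",  # what client.uri would be
    path="images/cat.jpg",
    size=info["size"],
    version=info.get("VersionId", ""),
    etag=info.get("ETag", "").strip('"'),
    is_latest=info.get("IsLatest", True),
    last_modified=info.get("LastModified", ""),
)
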
datachain/data_storage/schema.py
CHANGED
@@ -67,7 +67,11 @@ def convert_rows_custom_column_types(
     for row in rows:
         row_list = list(row)
         for idx, t in custom_columns_types:
-            row_list[idx] =
+            row_list[idx] = (
+                t.default_value(dialect)
+                if row_list[idx] is None
+                else t.on_read_convert(row_list[idx], dialect)
+            )
 
         yield tuple(row_list)
 
@@ -136,7 +140,15 @@ class DataTable:
         self.column_types: dict[str, SQLType] = column_types or {}
 
     @staticmethod
-    def copy_column(
+    def copy_column(
+        column: sa.Column,
+        primary_key: Optional[bool] = None,
+        index: Optional[bool] = None,
+        nullable: Optional[bool] = None,
+        default: Optional[Any] = None,
+        server_default: Optional[Any] = None,
+        unique: Optional[bool] = None,
+    ) -> sa.Column:
         """
         Copy a sqlalchemy Column object intended for use as a signal column.
 
@@ -150,12 +162,14 @@ class DataTable:
         return sa.Column(
             column.name,
             column.type,
-            primary_key=column.primary_key,
-            index=column.index,
-            nullable=column.nullable,
-            default=column.default,
-            server_default=
+            primary_key=primary_key if primary_key is not None else column.primary_key,
+            index=index if index is not None else column.index,
+            nullable=nullable if nullable is not None else column.nullable,
+            default=default if default is not None else column.default,
+            server_default=(
+                server_default if server_default is not None else column.server_default
+            ),
+            unique=unique if unique is not None else column.unique,
         )
 
     @classmethod

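copy_column() now accepts per-attribute overrides, falling back to the source column for anything left as None. A small sketch of the intended use:

import sqlalchemy as sa

from datachain.data_storage.schema import DataTable

src = sa.Column("score", sa.Float, nullable=False)

# Copy the column but relax nullability; every attribute not passed explicitly
# (primary_key, index, default, server_default, unique) is taken from `src`.
relaxed = DataTable.copy_column(src, nullable=True)

assert relaxed.name == "score"
assert relaxed.nullable is True
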
datachain/data_storage/sqlite.py
CHANGED
@@ -20,6 +20,8 @@ from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
 from sqlalchemy.sql.expression import bindparam, cast
+from sqlalchemy.sql.selectable import Select
+from tqdm import tqdm
 
 import datachain.sql.sqlite
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
@@ -35,14 +37,13 @@ from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_diale
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
 from datachain.storage import StorageURI
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir, batched_it
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
-    from sqlalchemy.sql.elements import
-    from sqlalchemy.sql.selectable import Select
+    from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.types import TypeEngine
 
 
@@ -54,8 +55,6 @@ RETRY_FACTOR = 2
 
 DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
 
-Column = Union[str, "ColumnClause[Any]", "TextClause"]
-
 datachain.sql.sqlite.setup()
 
 quote_schema = sqlite_dialect.identifier_preparer.quote_schema
@@ -82,6 +81,17 @@ def retry_sqlite_locks(func):
     return wrapper
 
 
+def get_db_file_in_memory(
+    db_file: Optional[str] = None, in_memory: bool = False
+) -> Optional[str]:
+    """Get in-memory db_file and check that conflicting arguments are not provided."""
+    if in_memory:
+        if db_file and db_file != ":memory:":
+            raise RuntimeError("A db_file cannot be specified if in_memory is True")
+        db_file = ":memory:"
+    return db_file
+
+
 class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
@@ -122,6 +132,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         engine = sqlalchemy.create_engine(
             "sqlite+pysqlite:///", creator=lambda: db, future=True
         )
+        # ensure we run SA on_connect init (e.g it registers regexp function),
+        # also makes sure that it's consistent. Otherwise in some cases it
+        # seems we are getting different results if engine object is used in a
+        # different thread first and enine is not used in the Main thread.
+        engine.connect().close()
 
         db.isolation_level = None  # Use autocommit mode
         db.execute("PRAGMA foreign_keys = ON")
@@ -260,7 +275,10 @@ class SQLiteIDGenerator(AbstractDBIDGenerator):
         table_prefix: Optional[str] = None,
         skip_db_init: bool = False,
         db_file: Optional[str] = None,
+        in_memory: bool = False,
     ):
+        db_file = get_db_file_in_memory(db_file, in_memory)
+
         db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
         super().__init__(db, table_prefix, skip_db_init)
@@ -378,6 +396,7 @@ class SQLiteMetastore(AbstractDBMetastore):
         partial_id: Optional[int] = None,
         db: Optional["SQLiteDatabaseEngine"] = None,
         db_file: Optional[str] = None,
+        in_memory: bool = False,
     ):
         self.schema: DefaultSchema = DefaultSchema()
         super().__init__(id_generator, uri, partial_id)
@@ -386,6 +405,8 @@ class SQLiteMetastore(AbstractDBMetastore):
         # foreign keys
         self.default_table_names: list[str] = []
 
+        db_file = get_db_file_in_memory(db_file, in_memory)
+
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
         self._init_tables()
@@ -550,10 +571,13 @@ class SQLiteWarehouse(AbstractWarehouse):
         id_generator: "SQLiteIDGenerator",
         db: Optional["SQLiteDatabaseEngine"] = None,
         db_file: Optional[str] = None,
+        in_memory: bool = False,
     ):
         self.schema: DefaultSchema = DefaultSchema()
         super().__init__(id_generator)
 
+        db_file = get_db_file_in_memory(db_file, in_memory)
+
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
     def __exit__(self, exc_type, exc_value, traceback) -> None:
@@ -626,9 +650,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         self.db.create_table(table, if_not_exists=if_not_exists)
         return table
 
-    def dataset_rows_select(
-        self, select_query: sqlalchemy.sql.selectable.Select, **kwargs
-    ):
+    def dataset_rows_select(self, select_query: Select, **kwargs):
         rows = self.db.execute(select_query, **kwargs)
         yield from convert_rows_custom_column_types(
             select_query.selected_columns, rows, sqlite_dialect
@@ -746,6 +768,34 @@ class SQLiteWarehouse(AbstractWarehouse):
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
 
+    def copy_table(
+        self,
+        table: Table,
+        query: Select,
+        progress_cb: Optional[Callable[[int], None]] = None,
+    ) -> None:
+        if "sys__id" in query.selected_columns:
+            col_id = query.selected_columns.sys__id
+        else:
+            col_id = sqlalchemy.column("sys__id")
+        select_ids = query.with_only_columns(col_id)
+
+        ids = self.db.execute(select_ids).fetchall()
+
+        select_q = query.with_only_columns(
+            *[c for c in query.selected_columns if c.name != "sys__id"]
+        )
+
+        for batch in batched_it(ids, 10_000):
+            batch_ids = [row[0] for row in batch]
+            select_q._where_criteria = (col_id.in_(batch_ids),)
+            q = table.insert().from_select(list(select_q.selected_columns), select_q)
+
+            self.db.execute(q)
+
+            if progress_cb:
+                progress_cb(len(batch_ids))
+
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
         Create a temporary table from a query for use in a UDF.
@@ -757,11 +807,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         ]
         table = self.create_udf_table(columns)
 
-        )
-        self.db.execute(
-            table.insert().from_select(list(select_q.selected_columns), select_q)
-        )
+        with tqdm(desc="Preparing", unit=" rows") as pbar:
+            self.copy_table(table, query, progress_cb=pbar.update)
 
         return table

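The small helper added at the top of the module centralizes the in-memory handling that SQLiteIDGenerator, SQLiteMetastore and SQLiteWarehouse now share. A quick sketch of its contract:

from datachain.data_storage.sqlite import get_db_file_in_memory

assert get_db_file_in_memory(None, in_memory=True) == ":memory:"
assert get_db_file_in_memory("db.sqlite", in_memory=False) == "db.sqlite"

# Passing a real file path together with in_memory=True is rejected.
try:
    get_db_file_in_memory("db.sqlite", in_memory=True)
except RuntimeError as exc:
    print(exc)  # A db_file cannot be specified if in_memory is True
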
datachain/data_storage/warehouse.py
CHANGED
@@ -6,7 +6,7 @@ import random
 import string
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 from urllib.parse import urlparse
 
 import attrs
@@ -14,6 +14,7 @@ import sqlalchemy as sa
 from sqlalchemy import Table, case, select
 from sqlalchemy.sql import func
 from sqlalchemy.sql.expression import true
+from tqdm import tqdm
 
 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
@@ -901,6 +902,17 @@ class AbstractWarehouse(ABC, Serializable):
         self.db.create_table(tbl, if_not_exists=True)
         return tbl
 
+    @abstractmethod
+    def copy_table(
+        self,
+        table: Table,
+        query: "Select",
+        progress_cb: Optional[Callable[[int], None]] = None,
+    ) -> None:
+        """
+        Copy the results of a query into a table.
+        """
+
     @abstractmethod
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -928,8 +940,10 @@ class AbstractWarehouse(ABC, Serializable):
         This should be implemented to ensure that the provided tables
         are cleaned up as soon as they are no longer needed.
         """
-
-
+        with tqdm(desc="Cleanup", unit=" tables") as pbar:
+            for name in names:
+                self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
+                pbar.update(1)
 
     def changed_query(
         self,

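How a concrete warehouse is expected to drive the new abstract copy_table() is shown by SQLiteWarehouse.create_pre_udf_table() above. The same pattern, sketched as a standalone helper with hypothetical warehouse/table/query arguments:

from tqdm import tqdm

def copy_with_progress(warehouse, table, query) -> None:
    """Sketch: wrap the copy_table() contract with a tqdm progress bar,
    mirroring SQLiteWarehouse.create_pre_udf_table() above.
    `warehouse`, `table` and `query` are placeholders for any
    AbstractWarehouse subclass, a target Table and a Select."""
    with tqdm(desc="Preparing", unit=" rows") as pbar:
        warehouse.copy_table(table, query, progress_cb=pbar.update)
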
datachain/lib/arrow.py
CHANGED
@@ -122,7 +122,7 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[misc]
+        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):

datachain/lib/convert/values_to_tuples.py
CHANGED
@@ -1,7 +1,12 @@
 from collections.abc import Sequence
 from typing import Any, Union
 
-from datachain.lib.data_model import
+from datachain.lib.data_model import (
+    DataType,
+    DataTypeNames,
+    DataValuesType,
+    is_chain_type,
+)
 from datachain.lib.utils import DataChainParamsError
 
 
@@ -15,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
 def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map,
+    **fr_map: Sequence[DataValuesType],
 ) -> tuple[Any, Any, Any]:
     if output:
         if not isinstance(output, (Sequence, str, dict)):
@@ -47,10 +52,10 @@ def values_to_tuples(  # noqa: C901, PLR0912
             f" number of signals '{len(fr_map)}'",
         )
 
-    types_map = {}
+    types_map: dict[str, type] = {}
     length = -1
     for k, v in fr_map.items():
-        if not isinstance(v, Sequence) or isinstance(v, str):
+        if not isinstance(v, Sequence) or isinstance(v, str):  # type: ignore[unreachable]
             raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
         len_ = len(v)
@@ -64,15 +69,16 @@ def values_to_tuples(  # noqa: C901, PLR0912
         if len_ == 0:
             raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
 
-
+        first_element = next(iter(v))
+        typ = type(first_element)
         if not is_chain_type(typ):
             raise ValuesToTupleError(
                 ds_name,
                 f"signal '{k}' has unsupported type '{typ.__name__}'."
                 f" Please use DataModel types: {DataTypeNames}",
             )
-        if
-        types_map[k] = list[type(
+        if isinstance(first_element, list):
+            types_map[k] = list[type(first_element[0])]  # type: ignore[assignment, misc]
         else:
             types_map[k] = typ
 
@@ -98,7 +104,7 @@ def values_to_tuples(  # noqa: C901, PLR0912
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
         res_type = tuple[tuple_type]  # type: ignore[valid-type]
-        res_values = list(zip(*fr_map.values()))
+        res_values: Sequence[Any] = list(zip(*fr_map.values()))
     else:
         res_type = output_types[0]  # type: ignore[misc]
         res_values = next(iter(fr_map.values()))

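A hedged sketch of calling the helper with the stricter typing above: every keyword argument is a sequence of signal values, all of equal length, and the result is the 3-tuple described by the return annotation (the example values are illustrative only):

from datachain.lib.convert.values_to_tuples import values_to_tuples

# Two signals, three rows each; unequal lengths or unsupported element types
# raise ValuesToTupleError (see the checks above).
result = values_to_tuples("demo", output=None, num=[1, 2, 3], name=["a", "b", "c"])
assert len(result) == 3  # per the `tuple[Any, Any, Any]` return annotation
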