datachain 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datachain/asyn.py CHANGED
@@ -224,3 +224,23 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
     async def _break_iteration(self) -> None:
         self.heap = []
         self._push_result(self._next_yield, None)
+
+
+def iter_over_async(ait, loop):
+    """Wrap an asynchronous iterator into a synchronous one"""
+    ait = ait.__aiter__()
+
+    # helper async fn that just gets the next element from the async iterator
+    async def get_next():
+        try:
+            obj = await ait.__anext__()
+            return False, obj
+        except StopAsyncIteration:
+            return True, None
+
+    # actual sync iterator
+    while True:
+        done, obj = asyncio.run_coroutine_threadsafe(get_next(), loop).result()
+        if done:
+            break
+        yield obj
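The new `iter_over_async` helper drains an async iterator from synchronous code by scheduling each `__anext__` call on an already-running event loop. A minimal sketch of driving it directly, assuming a loop running in a background thread (the thread/loop setup below is illustrative, not part of this diff):

# Illustrative only: consume an async generator from sync code.
import asyncio
import threading

from datachain.asyn import iter_over_async

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()  # background loop

async def agen():
    for i in range(3):
        yield i

print(list(iter_over_async(agen(), loop)))  # [0, 1, 2]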
datachain/catalog/catalog.py CHANGED
@@ -577,6 +577,7 @@ class Catalog:
         warehouse_ready_callback: Optional[
             Callable[["AbstractWarehouse"], None]
         ] = None,
+        in_memory: bool = False,
     ):
         datachain_dir = DataChainDir(cache=cache_dir, tmp=tmp_dir)
         datachain_dir.init()
@@ -590,6 +591,7 @@ class Catalog:
             "tmp_dir": tmp_dir,
         }
         self._warehouse_ready_callback = warehouse_ready_callback
+        self.in_memory = in_memory
 
     @cached_property
     def warehouse(self) -> "AbstractWarehouse":
datachain/catalog/loader.py CHANGED
@@ -28,8 +28,10 @@ WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
 DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"
 
+IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
 
-def get_id_generator() -> "AbstractIDGenerator":
+
+def get_id_generator(in_memory: bool = False) -> "AbstractIDGenerator":
     id_generator_serialized = os.environ.get(ID_GENERATOR_SERIALIZED)
     if id_generator_serialized:
         id_generator_obj = deserialize(id_generator_serialized)
@@ -43,25 +45,31 @@ def get_id_generator() -> "AbstractIDGenerator":
     id_generator_import_path = os.environ.get(ID_GENERATOR_IMPORT_PATH)
     id_generator_arg_envs = get_envs_by_prefix(ID_GENERATOR_ARG_PREFIX)
     # Convert env variable names to keyword argument names by lowercasing them
-    id_generator_args = {k.lower(): v for k, v in id_generator_arg_envs.items()}
-
-    if id_generator_import_path:
-        # ID generator paths are specified as (for example):
-        # datachain.data_storage.SQLiteIDGenerator
-        if "." not in id_generator_import_path:
-            raise RuntimeError(
-                f"Invalid {ID_GENERATOR_IMPORT_PATH} import path:"
-                f"{id_generator_import_path}"
-            )
-        module_name, _, class_name = id_generator_import_path.rpartition(".")
-        id_generator = import_module(module_name)
-        id_generator_class = getattr(id_generator, class_name)
-    else:
-        id_generator_class = SQLiteIDGenerator
+    id_generator_args: dict[str, Any] = {
+        k.lower(): v for k, v in id_generator_arg_envs.items()
+    }
+
+    if not id_generator_import_path:
+        id_generator_args["in_memory"] = in_memory
+        return SQLiteIDGenerator(**id_generator_args)
+    if in_memory:
+        raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
+    # ID generator paths are specified as (for example):
+    # datachain.data_storage.SQLiteIDGenerator
+    if "." not in id_generator_import_path:
+        raise RuntimeError(
+            f"Invalid {ID_GENERATOR_IMPORT_PATH} import path:"
+            f"{id_generator_import_path}"
+        )
+    module_name, _, class_name = id_generator_import_path.rpartition(".")
+    id_generator = import_module(module_name)
+    id_generator_class = getattr(id_generator, class_name)
     return id_generator_class(**id_generator_args)
 
 
-def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMetastore":
+def get_metastore(
+    id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
+) -> "AbstractMetastore":
     metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
     if metastore_serialized:
         metastore_obj = deserialize(metastore_serialized)
@@ -78,24 +86,32 @@ def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMet
     metastore_import_path = os.environ.get(METASTORE_IMPORT_PATH)
     metastore_arg_envs = get_envs_by_prefix(METASTORE_ARG_PREFIX)
     # Convert env variable names to keyword argument names by lowercasing them
-    metastore_args = {k.lower(): v for k, v in metastore_arg_envs.items()}
-
-    if metastore_import_path:
-        # Metastore paths are specified as (for example):
-        # datachain.data_storage.SQLiteMetastore
-        if "." not in metastore_import_path:
-            raise RuntimeError(
-                f"Invalid {METASTORE_IMPORT_PATH} import path: {metastore_import_path}"
-            )
-        module_name, _, class_name = metastore_import_path.rpartition(".")
-        metastore = import_module(module_name)
-        metastore_class = getattr(metastore, class_name)
-    else:
-        metastore_class = SQLiteMetastore
+    metastore_args: dict[str, Any] = {
+        k.lower(): v for k, v in metastore_arg_envs.items()
+    }
+
+    if not metastore_import_path:
+        if not isinstance(id_generator, SQLiteIDGenerator):
+            raise ValueError("SQLiteMetastore can only be used with SQLiteIDGenerator")
+        metastore_args["in_memory"] = in_memory
+        return SQLiteMetastore(id_generator, **metastore_args)
+    if in_memory:
+        raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
+    # Metastore paths are specified as (for example):
+    # datachain.data_storage.SQLiteMetastore
+    if "." not in metastore_import_path:
+        raise RuntimeError(
+            f"Invalid {METASTORE_IMPORT_PATH} import path: {metastore_import_path}"
+        )
+    module_name, _, class_name = metastore_import_path.rpartition(".")
+    metastore = import_module(module_name)
+    metastore_class = getattr(metastore, class_name)
     return metastore_class(id_generator, **metastore_args)
 
 
-def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWarehouse":
+def get_warehouse(
+    id_generator: Optional["AbstractIDGenerator"], in_memory: bool = False
+) -> "AbstractWarehouse":
     warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
     if warehouse_serialized:
         warehouse_obj = deserialize(warehouse_serialized)
@@ -112,20 +128,26 @@ def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWar
     warehouse_import_path = os.environ.get(WAREHOUSE_IMPORT_PATH)
     warehouse_arg_envs = get_envs_by_prefix(WAREHOUSE_ARG_PREFIX)
     # Convert env variable names to keyword argument names by lowercasing them
-    warehouse_args = {k.lower(): v for k, v in warehouse_arg_envs.items()}
-
-    if warehouse_import_path:
-        # Warehouse paths are specified as (for example):
-        # datachain.data_storage.SQLiteWarehouse
-        if "." not in warehouse_import_path:
-            raise RuntimeError(
-                f"Invalid {WAREHOUSE_IMPORT_PATH} import path: {warehouse_import_path}"
-            )
-        module_name, _, class_name = warehouse_import_path.rpartition(".")
-        warehouse = import_module(module_name)
-        warehouse_class = getattr(warehouse, class_name)
-    else:
-        warehouse_class = SQLiteWarehouse
+    warehouse_args: dict[str, Any] = {
+        k.lower(): v for k, v in warehouse_arg_envs.items()
+    }
+
+    if not warehouse_import_path:
+        if not isinstance(id_generator, SQLiteIDGenerator):
+            raise ValueError("SQLiteWarehouse can only be used with SQLiteIDGenerator")
+        warehouse_args["in_memory"] = in_memory
+        return SQLiteWarehouse(id_generator, **warehouse_args)
+    if in_memory:
+        raise RuntimeError(IN_MEMORY_ERROR_MESSAGE)
+    # Warehouse paths are specified as (for example):
+    # datachain.data_storage.SQLiteWarehouse
+    if "." not in warehouse_import_path:
+        raise RuntimeError(
+            f"Invalid {WAREHOUSE_IMPORT_PATH} import path: {warehouse_import_path}"
+        )
+    module_name, _, class_name = warehouse_import_path.rpartition(".")
+    warehouse = import_module(module_name)
+    warehouse_class = getattr(warehouse, class_name)
     return warehouse_class(id_generator, **warehouse_args)
 
 
@@ -152,7 +174,9 @@ def get_distributed_class(**kwargs):
     return distributed_class(**distributed_args | kwargs)
 
 
-def get_catalog(client_config: Optional[dict[str, Any]] = None) -> Catalog:
+def get_catalog(
+    client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
+) -> Catalog:
     """
     Function that creates Catalog instance with appropriate metastore
    and warehouse classes. Metastore class can be provided with env variable
@@ -164,10 +188,11 @@ def get_catalog(client_config: Optional[dict[str, Any]] = None) -> Catalog:
    and name of variable after, e.g. if it accepts team_id as kwargs
    we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
    """
-    id_generator = get_id_generator()
+    id_generator = get_id_generator(in_memory=in_memory)
    return Catalog(
        id_generator=id_generator,
-        metastore=get_metastore(id_generator),
-        warehouse=get_warehouse(id_generator),
+        metastore=get_metastore(id_generator, in_memory=in_memory),
+        warehouse=get_warehouse(id_generator, in_memory=in_memory),
        client_config=client_config,
+        in_memory=in_memory,
    )
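With the loader changes above, an in-memory setup is only supported on the default SQLite backend; passing `in_memory=True` while a custom import path is configured raises the new `IN_MEMORY_ERROR_MESSAGE`. A hedged usage sketch (default SQLite configuration assumed):

# Sketch: build a throwaway catalog backed by SQLite ":memory:" databases.
from datachain.catalog.loader import get_catalog

catalog = get_catalog(in_memory=True)  # id generator, metastore and warehouse all in memory
assert catalog.in_memory               # attribute added to Catalog in this release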
datachain/client/azure.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any
 from adlfs import AzureBlobFileSystem
 from tqdm import tqdm
 
+from datachain.lib.file import File
 from datachain.node import Entry
 
 from .fsspec import DELIMITER, Client, ResultQueue
@@ -24,6 +25,18 @@ class AzureClient(Client):
             size=v.get("size", ""),
         )
 
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        version_id = v.get("version_id")
+        return File(
+            source=self.uri,
+            path=path,
+            etag=v.get("etag", "").strip('"'),
+            version=version_id or "",
+            is_latest=version_id is None or bool(v.get("is_current_version")),
+            last_modified=v["last_modified"],
+            size=v.get("size", ""),
+        )
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         prefix = start_prefix
         if prefix:
datachain/client/gcs.py CHANGED
@@ -9,6 +9,7 @@ from dateutil.parser import isoparse
 from gcsfs import GCSFileSystem
 from tqdm import tqdm
 
+from datachain.lib.file import File
 from datachain.node import Entry
 
 from .fsspec import DELIMITER, Client, ResultQueue
@@ -120,3 +121,14 @@ class GCSClient(Client):
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            source=self.uri,
+            path=path,
+            etag=v.get("etag", ""),
+            version=v.get("generation", ""),
+            is_latest=not v.get("timeDeleted"),
+            last_modified=self.parse_timestamp(v["updated"]),
+            size=v.get("size", ""),
+        )
datachain/client/local.py CHANGED
@@ -7,6 +7,7 @@ from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
 
+from datachain.lib.file import File
 from datachain.node import Entry
 from datachain.storage import StorageURI
 
@@ -144,6 +145,16 @@ class FileClient(Client):
             size=v.get("size", ""),
         )
 
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", ""),
+            etag=v["mtime"].hex(),
+            is_latest=True,
+            last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
+        )
+
     def fetch_nodes(
         self,
         nodes,
datachain/client/s3.py CHANGED
@@ -5,6 +5,7 @@ from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
 from tqdm import tqdm
 
+from datachain.lib.file import File
 from datachain.node import Entry
 
 from .fsspec import DELIMITER, Client, ResultQueue
@@ -167,3 +168,14 @@ class ClientS3(Client):
             owner_name=v.get("Owner", {}).get("DisplayName", ""),
             owner_id=v.get("Owner", {}).get("ID", ""),
         )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            source=self.uri,
+            path=path,
+            size=v["size"],
+            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
+            etag=v.get("ETag", "").strip('"'),
+            is_latest=v.get("IsLatest", True),
+            last_modified=v.get("LastModified", ""),
+        )
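Each storage client gains an `info_to_file` counterpart to its existing `Entry`-producing method, returning the new `datachain.lib.file.File` model instead. A rough sketch of the field mapping the S3 variant performs, with an invented listing entry (the values and bucket name are placeholders, not from this diff):

# Sketch: how a raw s3fs listing entry maps onto the File model.
from datetime import datetime, timezone

from datachain.lib.file import File

info = {
    "size": 1024,
    "ETag": '"abc123"',
    "IsLatest": True,
    "LastModified": datetime(2024, 8, 20, tzinfo=timezone.utc),
}
file = File(
    source="s3://my-bucket",                     # client.uri in the real method
    path="images/cat.jpg",
    size=info["size"],
    etag=info.get("ETag", "").strip('"'),
    is_latest=info.get("IsLatest", True),
    last_modified=info.get("LastModified", ""),
)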
datachain/data_storage/sqlite.py CHANGED
@@ -20,6 +20,8 @@ from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
 from sqlalchemy.sql.expression import bindparam, cast
+from sqlalchemy.sql.selectable import Select
+from tqdm import tqdm
 
 import datachain.sql.sqlite
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
@@ -35,14 +37,13 @@ from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_diale
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
 from datachain.storage import StorageURI
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir, batched_it
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
-    from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
-    from sqlalchemy.sql.selectable import Select
+    from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.types import TypeEngine
 
 
@@ -54,8 +55,6 @@ RETRY_FACTOR = 2
 
 DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
 
-Column = Union[str, "ColumnClause[Any]", "TextClause"]
-
 datachain.sql.sqlite.setup()
 
 quote_schema = sqlite_dialect.identifier_preparer.quote_schema
@@ -82,6 +81,17 @@ def retry_sqlite_locks(func):
     return wrapper
 
 
+def get_db_file_in_memory(
+    db_file: Optional[str] = None, in_memory: bool = False
+) -> Optional[str]:
+    """Get in-memory db_file and check that conflicting arguments are not provided."""
+    if in_memory:
+        if db_file and db_file != ":memory:":
+            raise RuntimeError("A db_file cannot be specified if in_memory is True")
+        db_file = ":memory:"
+    return db_file
+
+
 class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
@@ -265,7 +275,10 @@ class SQLiteIDGenerator(AbstractDBIDGenerator):
         table_prefix: Optional[str] = None,
         skip_db_init: bool = False,
         db_file: Optional[str] = None,
+        in_memory: bool = False,
     ):
+        db_file = get_db_file_in_memory(db_file, in_memory)
+
         db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
         super().__init__(db, table_prefix, skip_db_init)
@@ -383,6 +396,7 @@ class SQLiteMetastore(AbstractDBMetastore):
         partial_id: Optional[int] = None,
         db: Optional["SQLiteDatabaseEngine"] = None,
         db_file: Optional[str] = None,
+        in_memory: bool = False,
     ):
         self.schema: DefaultSchema = DefaultSchema()
         super().__init__(id_generator, uri, partial_id)
@@ -391,6 +405,8 @@ class SQLiteMetastore(AbstractDBMetastore):
         # foreign keys
         self.default_table_names: list[str] = []
 
+        db_file = get_db_file_in_memory(db_file, in_memory)
+
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
         self._init_tables()
@@ -555,10 +571,13 @@ class SQLiteWarehouse(AbstractWarehouse):
         id_generator: "SQLiteIDGenerator",
         db: Optional["SQLiteDatabaseEngine"] = None,
         db_file: Optional[str] = None,
+        in_memory: bool = False,
     ):
         self.schema: DefaultSchema = DefaultSchema()
         super().__init__(id_generator)
 
+        db_file = get_db_file_in_memory(db_file, in_memory)
+
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
     def __exit__(self, exc_type, exc_value, traceback) -> None:
@@ -631,9 +650,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         self.db.create_table(table, if_not_exists=if_not_exists)
         return table
 
-    def dataset_rows_select(
-        self, select_query: sqlalchemy.sql.selectable.Select, **kwargs
-    ):
+    def dataset_rows_select(self, select_query: Select, **kwargs):
         rows = self.db.execute(select_query, **kwargs)
         yield from convert_rows_custom_column_types(
             select_query.selected_columns, rows, sqlite_dialect
@@ -751,6 +768,34 @@ class SQLiteWarehouse(AbstractWarehouse):
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
 
+    def copy_table(
+        self,
+        table: Table,
+        query: Select,
+        progress_cb: Optional[Callable[[int], None]] = None,
+    ) -> None:
+        if "sys__id" in query.selected_columns:
+            col_id = query.selected_columns.sys__id
+        else:
+            col_id = sqlalchemy.column("sys__id")
+        select_ids = query.with_only_columns(col_id)
+
+        ids = self.db.execute(select_ids).fetchall()
+
+        select_q = query.with_only_columns(
+            *[c for c in query.selected_columns if c.name != "sys__id"]
+        )
+
+        for batch in batched_it(ids, 10_000):
+            batch_ids = [row[0] for row in batch]
+            select_q._where_criteria = (col_id.in_(batch_ids),)
+            q = table.insert().from_select(list(select_q.selected_columns), select_q)
+
+            self.db.execute(q)
+
+            if progress_cb:
+                progress_cb(len(batch_ids))
+
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
         Create a temporary table from a query for use in a UDF.
@@ -762,11 +807,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         ]
         table = self.create_udf_table(columns)
 
-        select_q = query.with_only_columns(
-            *[c for c in query.selected_columns if c.name != "sys__id"]
-        )
-        self.db.execute(
-            table.insert().from_select(list(select_q.selected_columns), select_q)
-        )
+        with tqdm(desc="Preparing", unit=" rows") as pbar:
+            self.copy_table(table, query, progress_cb=pbar.update)
 
         return table
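The `get_db_file_in_memory` helper is what lets every SQLite component above accept `in_memory=True`: it normalizes the argument pair to SQLite's `:memory:` path and rejects conflicting inputs. A small illustration of the intended behavior, assuming the module path shown in this diff:

# Sketch of the helper's contract as defined in datachain/data_storage/sqlite.py.
from datachain.data_storage.sqlite import get_db_file_in_memory

assert get_db_file_in_memory(None, in_memory=True) == ":memory:"
assert get_db_file_in_memory("db.sqlite", in_memory=False) == "db.sqlite"
try:
    get_db_file_in_memory("db.sqlite", in_memory=True)  # conflicting arguments
except RuntimeError as exc:
    print(exc)  # "A db_file cannot be specified if in_memory is True"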
datachain/data_storage/warehouse.py CHANGED
@@ -6,7 +6,7 @@ import random
 import string
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 from urllib.parse import urlparse
 
 import attrs
@@ -14,6 +14,7 @@ import sqlalchemy as sa
 from sqlalchemy import Table, case, select
 from sqlalchemy.sql import func
 from sqlalchemy.sql.expression import true
+from tqdm import tqdm
 
 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
@@ -901,6 +902,17 @@ class AbstractWarehouse(ABC, Serializable):
         self.db.create_table(tbl, if_not_exists=True)
         return tbl
 
+    @abstractmethod
+    def copy_table(
+        self,
+        table: Table,
+        query: "Select",
+        progress_cb: Optional[Callable[[int], None]] = None,
+    ) -> None:
+        """
+        Copy the results of a query into a table.
+        """
+
     @abstractmethod
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -928,8 +940,10 @@ class AbstractWarehouse(ABC, Serializable):
         This should be implemented to ensure that the provided tables
         are cleaned up as soon as they are no longer needed.
         """
-        for name in names:
-            self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
+        with tqdm(desc="Cleanup", unit=" tables") as pbar:
+            for name in names:
+                self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
+                pbar.update(1)
 
     def changed_query(
         self,
datachain/lib/arrow.py CHANGED
@@ -122,7 +122,7 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[misc]
+        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
datachain/lib/convert/values_to_tuples.py CHANGED
@@ -1,7 +1,12 @@
 from collections.abc import Sequence
 from typing import Any, Union
 
-from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
+from datachain.lib.data_model import (
+    DataType,
+    DataTypeNames,
+    DataValuesType,
+    is_chain_type,
+)
 from datachain.lib.utils import DataChainParamsError
 
 
@@ -15,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
 def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map,
+    **fr_map: Sequence[DataValuesType],
 ) -> tuple[Any, Any, Any]:
     if output:
         if not isinstance(output, (Sequence, str, dict)):
@@ -47,10 +52,10 @@ def values_to_tuples(  # noqa: C901, PLR0912
             f" number of signals '{len(fr_map)}'",
         )
 
-    types_map = {}
+    types_map: dict[str, type] = {}
     length = -1
     for k, v in fr_map.items():
-        if not isinstance(v, Sequence) or isinstance(v, str):
+        if not isinstance(v, Sequence) or isinstance(v, str):  # type: ignore[unreachable]
             raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
         len_ = len(v)
 
@@ -64,15 +69,16 @@ def values_to_tuples(  # noqa: C901, PLR0912
         if len_ == 0:
             raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
 
-        typ = type(v[0])
+        first_element = next(iter(v))
+        typ = type(first_element)
         if not is_chain_type(typ):
             raise ValuesToTupleError(
                 ds_name,
                 f"signal '{k}' has unsupported type '{typ.__name__}'."
                 f" Please use DataModel types: {DataTypeNames}",
             )
-        if typ is list:
-            types_map[k] = list[type(v[0][0])]  # type: ignore[misc]
+        if isinstance(first_element, list):
+            types_map[k] = list[type(first_element[0])]  # type: ignore[assignment, misc]
         else:
            types_map[k] = typ
 
@@ -98,7 +104,7 @@ def values_to_tuples(  # noqa: C901, PLR0912
    if len(output) > 1:  # type: ignore[arg-type]
        tuple_type = tuple(output_types)
        res_type = tuple[tuple_type]  # type: ignore[valid-type]
-        res_values = list(zip(*fr_map.values()))
+        res_values: Sequence[Any] = list(zip(*fr_map.values()))
    else:
        res_type = output_types[0]  # type: ignore[misc]
        res_values = next(iter(fr_map.values()))
datachain/lib/data_model.py CHANGED
@@ -18,6 +18,7 @@ StandardType = Union[
 ]
 DataType = Union[type[BaseModel], StandardType]
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
+DataValuesType = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
 
 
 class DataModel(BaseModel):
datachain/lib/dc.py CHANGED
@@ -309,6 +309,7 @@ class DataChain(DatasetQuery):
         *,
         type: Literal["binary", "text", "image"] = "binary",
         session: Optional[Session] = None,
+        in_memory: bool = False,
         recursive: Optional[bool] = True,
         object_name: str = "file",
         update: bool = False,
@@ -332,7 +333,14 @@ class DataChain(DatasetQuery):
         """
         func = get_file(type)
         return (
-            cls(path, session=session, recursive=recursive, update=update, **kwargs)
+            cls(
+                path,
+                session=session,
+                recursive=recursive,
+                update=update,
+                in_memory=in_memory,
+                **kwargs,
+            )
             .map(**{object_name: func})
             .select(object_name)
         )
@@ -479,7 +487,10 @@ class DataChain(DatasetQuery):
 
     @classmethod
     def datasets(
-        cls, session: Optional[Session] = None, object_name: str = "dataset"
+        cls,
+        session: Optional[Session] = None,
+        in_memory: bool = False,
+        object_name: str = "dataset",
     ) -> "DataChain":
         """Generate chain with list of registered datasets.
 
@@ -492,7 +503,7 @@ class DataChain(DatasetQuery):
             print(f"{ds.name}@v{ds.version}")
         ```
         """
-        session = Session.get(session)
+        session = Session.get(session, in_memory=in_memory)
         catalog = session.catalog
 
         datasets = [
@@ -502,6 +513,7 @@ class DataChain(DatasetQuery):
         ]
 
         return cls.from_values(
+            in_memory=in_memory,
             output={object_name: DatasetInfo},
             **{object_name: datasets},  # type: ignore[arg-type]
         )
@@ -1142,6 +1154,7 @@ class DataChain(DatasetQuery):
         cls,
         ds_name: str = "",
         session: Optional[Session] = None,
+        in_memory: bool = False,
         output: OutputType = None,
         object_name: str = "",
         **fr_map,
@@ -1158,7 +1171,9 @@ class DataChain(DatasetQuery):
         def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
             yield from tuples
 
-        chain = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD, session=session)
+        chain = DataChain.from_records(
+            DataChain.DEFAULT_FILE_RECORD, session=session, in_memory=in_memory
+        )
         if object_name:
             output = {object_name: DataChain._dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
         return chain.gen(_func_fr, output=output)
@@ -1169,6 +1184,7 @@ class DataChain(DatasetQuery):
         df: "pd.DataFrame",
         name: str = "",
         session: Optional[Session] = None,
+        in_memory: bool = False,
         object_name: str = "",
     ) -> "DataChain":
         """Generate chain from pandas data-frame.
@@ -1196,7 +1212,9 @@ class DataChain(DatasetQuery):
                 f"import from pandas error - '{column}' cannot be a column name",
             )
 
-        return cls.from_values(name, session, object_name=object_name, **fr_map)
+        return cls.from_values(
+            name, session, object_name=object_name, in_memory=in_memory, **fr_map
+        )
 
     def to_pandas(self, flatten=False) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
@@ -1505,6 +1523,7 @@ class DataChain(DatasetQuery):
         cls,
         to_insert: Optional[Union[dict, list[dict]]],
         session: Optional[Session] = None,
+        in_memory: bool = False,
     ) -> "DataChain":
         """Create a DataChain from the provided records. This method can be used for
         programmatically generating a chain in contrast of reading data from storages
@@ -1520,7 +1539,7 @@ class DataChain(DatasetQuery):
         single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
         ```
         """
-        session = Session.get(session)
+        session = Session.get(session, in_memory=in_memory)
         catalog = session.catalog
 
         name = session.generate_temp_dataset_name()
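Taken together, the new `in_memory` flag threads from the high-level `DataChain` constructors down through `Session`, `Catalog` and the SQLite layer, so a chain can run entirely against throwaway `:memory:` databases. A hedged usage sketch:

# Sketch: build a small chain without touching the on-disk DataChain databases.
from datachain.lib.dc import DataChain

chain = DataChain.from_values(
    fib=[1, 1, 2, 3, 5, 8],
    in_memory=True,  # new in 0.3.4: session/catalog use SQLite ":memory:"
)
print(chain.to_pandas())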
datachain/lib/listing.py ADDED
@@ -0,0 +1,111 @@
+import asyncio
+from collections.abc import AsyncIterator, Iterator, Sequence
+from typing import Callable, Optional
+
+from botocore.exceptions import ClientError
+from fsspec.asyn import get_loop
+
+from datachain.asyn import iter_over_async
+from datachain.client import Client
+from datachain.error import ClientError as DataChainClientError
+from datachain.lib.file import File
+
+ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
+
+DELIMITER = "/"  # Path delimiter
+FETCH_WORKERS = 100
+
+
+async def _fetch_dir(client, prefix, result_queue) -> set[str]:
+    path = f"{client.name}/{prefix}"
+    infos = await client.ls_dir(path)
+    files = []
+    subdirs = set()
+    for info in infos:
+        full_path = info["name"]
+        subprefix = client.rel_path(full_path)
+        if prefix.strip(DELIMITER) == subprefix.strip(DELIMITER):
+            continue
+        if info["type"] == "directory":
+            subdirs.add(subprefix)
+        else:
+            files.append(client.info_to_file(info, subprefix))
+    if files:
+        await result_queue.put(files)
+    return subdirs
+
+
+async def _fetch(
+    client, start_prefix: str, result_queue: ResultQueue, fetch_workers
+) -> None:
+    loop = get_loop()
+
+    queue: asyncio.Queue[str] = asyncio.Queue()
+    queue.put_nowait(start_prefix)
+
+    async def process(queue) -> None:
+        while True:
+            prefix = await queue.get()
+            try:
+                subdirs = await _fetch_dir(client, prefix, result_queue)
+                for subdir in subdirs:
+                    queue.put_nowait(subdir)
+            except Exception:
+                while not queue.empty():
+                    queue.get_nowait()
+                    queue.task_done()
+                raise
+
+            finally:
+                queue.task_done()
+
+    try:
+        workers: list[asyncio.Task] = [
+            loop.create_task(process(queue)) for _ in range(fetch_workers)
+        ]
+
+        # Wait for all fetch tasks to complete
+        await queue.join()
+        # Stop the workers
+        excs = []
+        for worker in workers:
+            if worker.done() and (exc := worker.exception()):
+                excs.append(exc)
+            else:
+                worker.cancel()
+        if excs:
+            raise excs[0]
+    except ClientError as exc:
+        raise DataChainClientError(
+            exc.response.get("Error", {}).get("Message") or exc,
+            exc.response.get("Error", {}).get("Code"),
+        ) from exc
+    finally:
+        # This ensures the progress bar is closed before any exceptions are raised
+        result_queue.put_nowait(None)
+
+
+async def _scandir(client, prefix, fetch_workers) -> AsyncIterator:
+    """Recursively goes through dir tree and yields files"""
+    result_queue: ResultQueue = asyncio.Queue()
+    loop = get_loop()
+    main_task = loop.create_task(_fetch(client, prefix, result_queue, fetch_workers))
+    while (files := await result_queue.get()) is not None:
+        for f in files:
+            yield f
+
+    await main_task
+
+
+def list_bucket(uri: str, client_config=None, fetch_workers=FETCH_WORKERS) -> Callable:
+    """
+    Function that returns another generator function that yields File objects
+    from bucket where each File represents one bucket entry.
+    """
+
+    def list_func() -> Iterator[File]:
+        config = client_config or {}
+        client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
+        yield from iter_over_async(_scandir(client, path, fetch_workers), get_loop())
+
+    return list_func
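The new `datachain/lib/listing.py` module wraps the async directory walk into a plain generator factory. A minimal sketch of calling it directly (the bucket URI and client config are placeholders):

# Sketch: enumerate bucket entries as File objects with the new helper.
from datachain.lib.listing import list_bucket

list_func = list_bucket("s3://my-bucket/prefix", client_config={"anon": True})
for file in list_func():  # generator of datachain.lib.file.File
    print(file.path, file.size)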
datachain/query/dataset.py CHANGED
@@ -34,6 +34,7 @@ from sqlalchemy.sql.elements import ColumnClause, ColumnElement
 from sqlalchemy.sql.expression import label
 from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
+from tqdm import tqdm
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
 from datachain.catalog import (
@@ -125,7 +126,10 @@ class QueryGenerator:
     func: QueryGeneratorFunc
     columns: tuple[ColumnElement, ...]
 
-    def exclude(self, column_names) -> Select:
+    def only(self, column_names: Sequence[str]) -> Select:
+        return self.func(*(c for c in self.columns if c.name in column_names))
+
+    def exclude(self, column_names: Sequence[str]) -> Select:
         return self.func(*(c for c in self.columns if c.name not in column_names))
 
     def select(self, column_names=None) -> Select:
@@ -465,6 +469,12 @@ class UDFStep(Step, ABC):
 
         try:
             if workers:
+                if self.catalog.in_memory:
+                    raise RuntimeError(
+                        "In-memory databases cannot be used with "
+                        "distributed processing."
+                    )
+
                 from datachain.catalog.loader import get_distributed_class
 
                 distributor = get_distributed_class(min_task_size=self.min_task_size)
@@ -482,6 +492,10 @@ class UDFStep(Step, ABC):
                 )
             elif processes:
                 # Parallel processing (faster for more CPU-heavy UDFs)
+                if self.catalog.in_memory:
+                    raise RuntimeError(
+                        "In-memory databases cannot be used with parallel processing."
+                    )
                 udf_info = {
                     "udf_data": filtered_cloudpickle_dumps(self.udf),
                     "catalog_init": self.catalog.get_init_params(),
@@ -1049,6 +1063,7 @@ class DatasetQuery:
         indexing_feature_schema: Optional[dict] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
         update: Optional[bool] = False,
+        in_memory: bool = False,
     ):
         if client_config is None:
             client_config = {}
@@ -1057,7 +1072,7 @@
             client_config["anon"] = True
 
         self.session = Session.get(
-            session, catalog=catalog, client_config=client_config
+            session, catalog=catalog, client_config=client_config, in_memory=in_memory
         )
         self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
@@ -1648,18 +1663,13 @@
 
         dr = self.catalog.warehouse.dataset_rows(dataset)
 
-        # Exclude the id column and let the db create it to avoid unique
-        # constraint violations.
-        q = query.exclude(("sys__id",))
-        if q._order_by_clauses:
-            # ensuring we have id sorted by order by clause if it exists in a query
-            q = q.add_columns(
-                f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
+        with tqdm(desc="Saving", unit=" rows") as pbar:
+            self.catalog.warehouse.copy_table(
+                dr.get_table(),
+                query.select(),
+                progress_cb=pbar.update,
             )
 
-        cols = tuple(c.name for c in q.selected_columns)
-        insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
-        self.catalog.warehouse.db.execute(insert_q, **kwargs)
         self.catalog.metastore.update_dataset_status(
             dataset, DatasetStatus.COMPLETE, version=version
         )
datachain/query/session.py CHANGED
@@ -46,6 +46,7 @@ class Session:
         name="",
         catalog: Optional["Catalog"] = None,
         client_config: Optional[dict] = None,
+        in_memory: bool = False,
     ):
         if re.match(r"^[0-9a-zA-Z]+$", name) is None:
             raise ValueError(
@@ -58,7 +59,9 @@ class Session:
         session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
         self.name = f"{name}_{session_uuid}"
         self.is_new_catalog = not catalog
-        self.catalog = catalog or get_catalog(client_config=client_config)
+        self.catalog = catalog or get_catalog(
+            client_config=client_config, in_memory=in_memory
+        )
 
     def __enter__(self):
         return self
@@ -89,6 +92,7 @@ class Session:
         session: Optional["Session"] = None,
         catalog: Optional["Catalog"] = None,
         client_config: Optional[dict] = None,
+        in_memory: bool = False,
     ) -> "Session":
         """Creates a Session() object from a catalog.
 
@@ -102,7 +106,10 @@ class Session:
 
         if cls.GLOBAL_SESSION is None:
             cls.GLOBAL_SESSION_CTX = Session(
-                cls.GLOBAL_SESSION_NAME, catalog, client_config=client_config
+                cls.GLOBAL_SESSION_NAME,
+                catalog,
+                client_config=client_config,
+                in_memory=in_memory,
             )
             cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()
             atexit.register(cls._global_cleanup)
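`Session` forwards the flag into `get_catalog`, both for explicitly constructed sessions and for the global session created by `Session.get`. A sketch of an explicit throwaway session (the context-manager use mirrors the `__enter__` shown above):

# Sketch: a session whose catalog lives entirely in ":memory:" SQLite.
from datachain.query.session import Session

with Session("ephemeral", in_memory=True) as session:
    print(session.catalog.in_memory)  # True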
datachain/sql/functions/string.py CHANGED
@@ -26,5 +26,17 @@ class split(GenericFunction):  # noqa: N801
     inherit_cache = True
 
 
+class regexp_replace(GenericFunction):  # noqa: N801
+    """
+    Replaces substring that match a regular expression.
+    """
+
+    type = String()
+    package = "string"
+    name = "regexp_replace"
+    inherit_cache = True
+
+
 compiler_not_implemented(length)
 compiler_not_implemented(split)
+compiler_not_implemented(regexp_replace)
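`regexp_replace` is registered the same way as the existing `split` function: a `GenericFunction` compiled per dialect, with the SQLite implementation delegating to Python's `re.sub` (see the datachain/sql/sqlite/base.py hunks below). A hedged example of composing it in a query expression:

# Sketch: a SQL expression that strips digits from a column value.
import sqlalchemy as sa

from datachain.sql.functions import string

expr = string.regexp_replace(sa.column("name"), r"\d+", "")
# On SQLite this compiles to a regexp_replace(...) call backed by re.sub.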
datachain/sql/sqlite/base.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import re
 import sqlite3
 from collections.abc import Iterable
 from datetime import MAXYEAR, MINYEAR, datetime, timezone
@@ -77,6 +78,7 @@ def setup():
     compiles(array.length, "sqlite")(compile_array_length)
     compiles(string.length, "sqlite")(compile_string_length)
     compiles(string.split, "sqlite")(compile_string_split)
+    compiles(string.regexp_replace, "sqlite")(compile_regexp_replace)
     compiles(conditional.greatest, "sqlite")(compile_greatest)
     compiles(conditional.least, "sqlite")(compile_least)
     compiles(Values, "sqlite")(compile_values)
@@ -178,9 +180,15 @@ def register_user_defined_sql_functions() -> None:
 
     _registered_function_creators["vector_functions"] = create_vector_functions
 
+    def sqlite_regexp_replace(string: str, pattern: str, replacement: str) -> str:
+        return re.sub(pattern, replacement, string)
+
     def create_string_functions(conn):
         conn.create_function("split", 2, sqlite_string_split, deterministic=True)
         conn.create_function("split", 3, sqlite_string_split, deterministic=True)
+        conn.create_function(
+            "regexp_replace", 3, sqlite_regexp_replace, deterministic=True
+        )
 
     _registered_function_creators["string_functions"] = create_string_functions
 
@@ -221,24 +229,54 @@ def path_name(path):
     return func.ltrim(func.substr(path, func.length(path_parent(path)) + 1), slash)
 
 
-def path_file_ext_length(path):
-    name = path_name(path)
+def name_file_ext_length(name):
     expr = func.length(name) - func.length(
         func.rtrim(name, func.replace(name, dot, empty_str))
     )
     return case((func.instr(name, dot) == 0, 0), else_=expr)
 
 
+def path_file_ext_length(path):
+    name = path_name(path)
+    return name_file_ext_length(name)
+
+
 def path_file_stem(path):
-    return func.rtrim(
-        func.substr(path, 1, func.length(path) - path_file_ext_length(path)), dot
+    path_length = func.length(path)
+    parent_length = func.length(path_parent(path))
+
+    name_expr = func.rtrim(
+        func.substr(
+            path,
+            1,
+            path_length - name_file_ext_length(path),
+        ),
+        dot,
+    )
+
+    full_path_expr = func.ltrim(
+        func.rtrim(
+            func.substr(
+                path,
+                parent_length + 1,
+                path_length - parent_length - path_file_ext_length(path),
+            ),
+            dot,
+        ),
+        slash,
     )
 
+    return case((func.instr(path, slash) == 0, name_expr), else_=full_path_expr)
+
 
 def path_file_ext(path):
     return func.substr(path, func.length(path) - path_file_ext_length(path) + 1)
 
 
+def compile_regexp_replace(element, compiler, **kwargs):
+    return f"regexp_replace({compiler.process(element.clauses, **kwargs)})"
+
+
 def compile_path_parent(element, compiler, **kwargs):
     return compiler.process(path_parent(*element.clauses.clauses), **kwargs)
 
datachain-0.3.2.dist-info/METADATA → datachain-0.3.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.2
+Version: 0.3.4
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -43,7 +43,7 @@ Requires-Dist: Pillow <11,>=10.0.0
 Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
+Requires-Dist: mypy ==1.11.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
datachain-0.3.2.dist-info/RECORD → datachain-0.3.4.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
 datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
-datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
+datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
 datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
 datachain/cli.py,sha256=DbmI1sXs7-KCQz6RdLE_JAp3XO3yrTSRJ71LdUzx-XE,33099
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
@@ -17,17 +17,17 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=9fxRJjiM8tK3ZePHFErYqY6LkJFA6bvlp-KHq-_kSYk,80703
+datachain/catalog/catalog.py,sha256=_BRaD261RnCJgXr_DJcDf58XmbjLiuLMSsX97E8k3z8,80771
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
-datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
+datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
-datachain/client/azure.py,sha256=3RfDTAI_TszDy9WazHQd3bI3sS2wDFrNXfNqCDewZgE,2214
+datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
 datachain/client/fsspec.py,sha256=G4QTm3KPhlaV74T3gLXJ86345_ak8CH38ezn2ET-oLc,13230
-datachain/client/gcs.py,sha256=Mt77W_l8_fK61gLm4mmxNmENuOM0ETwxdiFp4S8d-_w,4105
-datachain/client/local.py,sha256=SyGnqcrbtSvDK6IJsQa6NxxHwbWaWIP1GLZsQBXg_IA,4939
-datachain/client/s3.py,sha256=GfRZZzNPQPRsYjoef8bbsLbanJPUlCbyGTTK8ojzp8A,6136
+datachain/client/gcs.py,sha256=P_E3mhzhXR9mJ_wc3AYZuczzwOJ0-D3J5qhJXeSU-xk,4518
+datachain/client/local.py,sha256=H8TNY8pi2kA8y9_f_1XLUjJF66f229qC_b2y4xGkzdU,5300
+datachain/client/s3.py,sha256=aQxfMH8G8bUjmHF1-6P90MSkXsU5DgOPEVlKWLu459I,6568
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
@@ -35,16 +35,17 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
 datachain/data_storage/schema.py,sha256=GwJIHkjhrnBxJAV1WvCMM8jiJN5h79LXDyzMmUDtRw0,8523
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=IKd4epEjVxAoQQHsE7WTY4kgOiFyUiWhvaGm-61rJfg,27218
-datachain/data_storage/warehouse.py,sha256=MXYkUG69UK2wbIFsZFvT7rKzXlnSitDMp3Vzj_IIsnA,33089
+datachain/data_storage/sqlite.py,sha256=GEE07ZXTAtzdf53J1UDLscS0xZjukRGlmZzG6q0fZI0,28589
+datachain/data_storage/warehouse.py,sha256=tyJJDxFae6XWgLmOoG0B_MJ_Z_UEMoW_wJb96zzwTtA,33471
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
+datachain/lib/arrow.py,sha256=D8N7zCppRdc5sTYT1hNIbROc-sKA_8FN5J_m-KjD3Us,4929
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
-datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
+datachain/lib/data_model.py,sha256=ZvtMRMcPpBxI-rOhkXb-ry1PkGYcEFFK1w1wH12vs4g,1718
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=Q9HL7Axfo9i5hodlkD2GwklN4i0BVULm9_A11ckuj2A,58352
+datachain/lib/dc.py,sha256=0pwNb91GW8MnHLfFd2YvEtEH0n77c3nxp5ozwIyW86o,58827
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
+datachain/lib/listing.py,sha256=nXLmGae_oQke4hnurzzWiHTEjHjWiqqHdB41Wb-hMTk,3521
 datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
@@ -62,17 +63,17 @@ datachain/lib/convert/flatten.py,sha256=YMoC00BqEy3zSpvCp6Q0DfxihuPmgjUJj1g2cesW
 datachain/lib/convert/python_to_sql.py,sha256=4gplGlr_Kg-Z40OpJUzJiarDWj7pwbUOk-dPOYYCJ9Q,2629
 datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
-datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
+datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=sRKY2it_znlzTNOt_OCRe008rHu0TXMnFwvGsnthSO0,60209
+datachain/query/dataset.py,sha256=7lxlybS7I5IPsgOqMz-W4vS6kWBDHkHQRqBHlIRYRPw,60473
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
-datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
+datachain/query/session.py,sha256=PkOLANS0s8KPz4wO17tAab-CMzIt7FK8RPzJiibExds,4290
 datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -87,15 +88,15 @@ datachain/sql/functions/array.py,sha256=EB7nJSncUc1PuxlHyzU2gVhF8DuXaxpGlxb5e8X2
 datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
 datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
-datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
+datachain/sql/functions/string.py,sha256=NSQIpmtQgm68hz3TFJsgHMBuo4MjBNhDSyEIC3pWkT8,916
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=w6HbEkGdmNGDnDY3_75E-wDb6qNskVpq0qbHGADsERk,12327
+datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,13364
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.2.dist-info/METADATA,sha256=i8evXYMe4FgBqxV7TYdWTRuh7MxRT6jfqmzL-tbk_JQ,16789
-datachain-0.3.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
-datachain-0.3.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.2.dist-info/RECORD,,
+datachain-0.3.4.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.4.dist-info/METADATA,sha256=nV1-yJcDxoWuaM8uSwEzYCpDqSMxhxPle3EZZ98a-LA,16789
+datachain-0.3.4.dist-info/WHEEL,sha256=nCVcAvsfA9TDtwGwhYaRrlPhTLV9m-Ga6mdyDtuwK18,91
+datachain-0.3.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.4.dist-info/RECORD,,
datachain-0.3.2.dist-info/WHEEL → datachain-0.3.4.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (72.2.0)
+Generator: setuptools (73.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any