datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/convert/values_to_tuples.py
CHANGED
@@ -1,63 +1,177 @@
 import itertools
 from collections.abc import Sequence
-from typing import Any, Union
-
-from datachain.lib.data_model import (
-    DataType,
-    DataTypeNames,
-    DataValue,
-    is_chain_type,
-)
+from typing import Any
+
+from datachain.lib.data_model import DataType, DataTypeNames, DataValue, is_chain_type
 from datachain.lib.utils import DataChainParamsError
 
 
 class ValuesToTupleError(DataChainParamsError):
     def __init__(self, ds_name: str, msg: str):
+        self.ds_name = ds_name
+        self.msg = msg
+
         if ds_name:
             ds_name = f"' {ds_name}'"
+
         super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
 
+    def __reduce__(self):
+        return ValuesToTupleError, (self.ds_name, self.msg)
 
-def values_to_tuples(  # noqa: C901, PLR0912
-    ds_name: str = "",
-    output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[DataValue],
-) -> tuple[Any, Any, Any]:
-    if output:
-        if not isinstance(output, (Sequence, str, dict)):
-            if len(fr_map) != 1:
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"only one output type was specified, {len(fr_map)} expected",
-                )
-            if not isinstance(output, type):
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"output must specify a type while '{output}' was given",
-                )
 
-            key: str = next(iter(fr_map.keys()))
-            output = {key: output}  # type: ignore[dict-item]
+def _find_first_non_none(sequence: Sequence[Any]) -> Any | None:
+    """Find the first non-None element in a sequence."""
+    try:
+        return next(itertools.dropwhile(lambda i: i is None, sequence))
+    except StopIteration:
+        return None
+
+
+def _infer_list_item_type(lst: list) -> type:
+    """Infer the item type of a list, handling None values and nested lists."""
+    if len(lst) == 0:
+        # Default to str when list is empty to avoid generic list
+        return str
+
+    first_item = _find_first_non_none(lst)
+    if first_item is None:
+        # Default to str when all items are None
+        return str
+
+    item_type = type(first_item)
+
+    # Handle nested lists one level deep
+    if isinstance(first_item, list) and len(first_item) > 0:
+        nested_item = _find_first_non_none(first_item)
+        if nested_item is not None:
+            return list[type(nested_item)]  # type: ignore[misc, return-value]
+        # Default to str for nested lists with all None
+        return list[str]  # type: ignore[return-value]
+
+    return item_type
+
+
+def _infer_dict_value_type(dct: dict) -> type:
+    """Infer the value type of a dict, handling None values and list values."""
+    if len(dct) == 0:
+        # Default to str when dict is empty to avoid generic dict values
+        return str
+
+    # Find first non-None value
+    first_value = None
+    for val in dct.values():
+        if val is not None:
+            first_value = val
+            break
+
+    if first_value is None:
+        # Default to str when all values are None
+        return str
+
+    # Handle list values
+    if isinstance(first_value, list) and len(first_value) > 0:
+        list_item = _find_first_non_none(first_value)
+        if list_item is not None:
+            return list[type(list_item)]  # type: ignore[misc, return-value]
+        # Default to str for lists with all None
+        return list[str]  # type: ignore[return-value]
+
+    return type(first_value)
+
+
+def _infer_type_from_sequence(
+    sequence: Sequence[DataValue], signal_name: str, ds_name: str
+) -> type:
+    """
+    Infer the type from a sequence of values.
+
+    Returns str if all values are None, otherwise infers from the first non-None value.
+    Handles lists and dicts with proper type inference for nested structures.
+    """
+    first_element = _find_first_non_none(sequence)
+
+    if first_element is None:
+        # Default to str if column is empty or all values are None
+        return str
 
-        if not isinstance(output, dict):
+    typ = type(first_element)
+
+    if not is_chain_type(typ):
+        raise ValuesToTupleError(
+            ds_name,
+            f"signal '{signal_name}' has unsupported type '{typ.__name__}'."
+            f" Please use DataModel types: {DataTypeNames}",
+        )
+
+    if isinstance(first_element, list):
+        item_type = _infer_list_item_type(first_element)
+        return list[item_type]  # type: ignore[valid-type, return-value]
+
+    if isinstance(first_element, dict):
+        # If the first dict is empty, use str as default key/value types
+        if len(first_element) == 0:
+            return dict[str, str]  # type: ignore[return-value]
+        first_key = next(iter(first_element.keys()))
+        value_type = _infer_dict_value_type(first_element)
+        return dict[type(first_key), value_type]  # type: ignore[misc, return-value]
+
+    return typ
+
+
+def _validate_and_normalize_output(
+    output: DataType | Sequence[str] | dict[str, DataType] | None,
+    fr_map: dict[str, Sequence[DataValue]],
+    ds_name: str,
+) -> dict[str, DataType] | None:
+    """Validate and normalize the output parameter to a dict format."""
+    if not output:
+        return None
+
+    if not isinstance(output, (Sequence, str, dict)):
+        if len(fr_map) != 1:
             raise ValuesToTupleError(
                 ds_name,
-                "output type must be dict[str, DataType] while "
-                f"'{type(output).__name__}' is given",
+                f"only one output type was specified, {len(fr_map)} expected",
             )
-
-        if len(output) != len(fr_map):
+        if not isinstance(output, type):
             raise ValuesToTupleError(
                 ds_name,
-                f"number of outputs '{len(output)}' should match"
-                f" number of signals '{len(fr_map)}'",
+                f"output must specify a type while '{output}' was given",
             )
 
+        key: str = next(iter(fr_map.keys()))
+        return {key: output}  # type: ignore[dict-item]
+
+    if not isinstance(output, dict):
+        raise ValuesToTupleError(
+            ds_name,
+            "output type must be dict[str, DataType] while "
+            f"'{type(output).__name__}' is given",
+        )
+
+    if len(output) != len(fr_map):
+        raise ValuesToTupleError(
+            ds_name,
+            f"number of outputs '{len(output)}' should match"
+            f" number of signals '{len(fr_map)}'",
+        )
+
+    return output  # type: ignore[return-value]
+
+
+def values_to_tuples(
+    ds_name: str = "",
+    output: DataType | Sequence[str] | dict[str, DataType] | None = None,
+    **fr_map: Sequence[DataValue],
+) -> tuple[Any, Any, Any]:
+    output = _validate_and_normalize_output(output, fr_map, ds_name)
+
     types_map: dict[str, type] = {}
     length = -1
     for k, v in fr_map.items():
         if not isinstance(v, Sequence) or isinstance(v, str):  # type: ignore[unreachable]
-            raise ValuesToTupleError(ds_name, f"
+            raise ValuesToTupleError(ds_name, f"signal '{k}' is not a sequence")
         len_ = len(v)
 
         if output:
@@ -70,23 +184,7 @@ def values_to_tuples(  # noqa: C901, PLR0912
         # FIXME: Stops as soon as it finds the first non-None value.
         # If a non-None value appears early, it won't check the remaining items for
         # `None` values.
-        try:
-            first_not_none_element = next(
-                itertools.dropwhile(lambda i: i is None, v)
-            )
-        except StopIteration:
-            # set default type to `str` if column is empty or all values are `None`
-            typ = str
-        else:
-            typ = type(first_not_none_element)  # type: ignore[assignment]
-            if not is_chain_type(typ):
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"signal '{k}' has unsupported type '{typ.__name__}'."
-                    f" Please use DataModel types: {DataTypeNames}",
-                )
-            if isinstance(first_not_none_element, list):
-                typ = list[type(first_not_none_element[0])]  # type: ignore[assignment, misc]
+        typ = _infer_type_from_sequence(v, k, ds_name)
         types_map[k] = typ
 
         if length < 0:
@@ -111,7 +209,7 @@ def values_to_tuples(  # noqa: C901, PLR0912
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
         res_type = tuple[tuple_type]  # type: ignore[valid-type]
-        res_values: Sequence[Any] = list(zip(*fr_map.values()))
+        res_values: Sequence[Any] = list(zip(*fr_map.values(), strict=False))
     else:
         res_type = output_types[0]  # type: ignore[misc]
         res_values = next(iter(fr_map.values()))
datachain/lib/data_model.py
CHANGED
@@ -1,8 +1,9 @@
 import inspect
+import types
 import uuid
 from collections.abc import Sequence
 from datetime import datetime
-from typing import ClassVar, Optional, Union, get_args, get_origin
+from typing import ClassVar, Union, get_args, get_origin
 
 from pydantic import AliasChoices, BaseModel, Field, create_model
 from pydantic.fields import FieldInfo
@@ -10,19 +11,19 @@ from pydantic.fields import FieldInfo
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import normalize_col_names
 
-StandardType = Union[
-    type[int],
-    type[str],
-    type[float],
-    type[bool],
-    type[list],
-    type[dict],
-    type[bytes],
-    type[datetime],
-]
-DataType = Union[type[BaseModel], StandardType]
+StandardType = (
+    type[int]
+    | type[str]
+    | type[float]
+    | type[bool]
+    | type[list]
+    | type[dict]
+    | type[bytes]
+    | type[datetime]
+)
+DataType = type[BaseModel] | StandardType
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
+DataValue = BaseModel | int | str | float | bool | list | dict | bytes | datetime
 
 
 class DataModel(BaseModel):
@@ -37,7 +38,7 @@ class DataModel(BaseModel):
         ModelStore.register(cls)
 
     @staticmethod
-    def register(models: Union[DataType, Sequence[DataType]]):
+    def register(models: DataType | Sequence[DataType]):
         """For registering classes manually. It accepts a single class or a sequence of
         classes."""
         if not isinstance(models, Sequence):
@@ -63,8 +64,11 @@ def is_chain_type(t: type) -> bool:
     if orig is list and len(args) == 1:
         return is_chain_type(get_args(t)[0])
 
-    if orig is Union and len(args) == 2 and (type(None) in args):
-        return is_chain_type(args[0])
+    if orig is dict and len(args) == 2:
+        return is_chain_type(args[0]) and is_chain_type(args[1])
+
+    if orig in (Union, types.UnionType) and len(args) == 2 and (type(None) in args):
+        return is_chain_type(args[0] if args[1] is type(None) else args[1])
 
     return False
 
@@ -72,19 +76,19 @@ def is_chain_type(t: type) -> bool:
 def dict_to_data_model(
     name: str,
     data_dict: dict[str, DataType],
-    original_names: Optional[list[str]] = None,
+    original_names: list[str] | None = None,
 ) -> type[BaseModel]:
     if not original_names:
         # Gets a map of a normalized_name -> original_name
         columns = normalize_col_names(list(data_dict))
-        data_dict = dict(zip(columns.keys(), data_dict.values()))
+        data_dict = dict(zip(columns.keys(), data_dict.values(), strict=False))
         original_names = list(columns.values())
 
     fields = {
         name: (
             anno
             if inspect.isclass(anno) and issubclass(anno, BaseModel)
-            else Optional[anno],
+            else anno | None,
             Field(
                 validation_alias=AliasChoices(name, original_names[idx] or name),
                 default=None,
datachain/lib/dataset_info.py
CHANGED
@@ -1,10 +1,10 @@
-import json
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 from uuid import uuid4
 
 from pydantic import Field, field_validator
 
+from datachain import json
 from datachain.dataset import (
     DEFAULT_DATASET_VERSION,
     DatasetListRecord,
@@ -28,9 +28,9 @@ class DatasetInfo(DataModel):
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
-    finished_at: Optional[datetime] = Field(default=None)
-    num_objects: Optional[int] = Field(default=None)
-    size: Optional[int] = Field(default=None)
+    finished_at: datetime | None = Field(default=None)
+    num_objects: int | None = Field(default=None)
+    size: int | None = Field(default=None)
     params: dict[str, str] = Field(default={})
     metrics: dict[str, Any] = Field(default={})
     error_message: str = Field(default="")
@@ -59,7 +59,7 @@ class DatasetInfo(DataModel):
 
     @staticmethod
     def _validate_dict(
-        v: Union[str, dict, None],
+        v: str | dict | None,
     ) -> dict:
         if v is None or v == "":
             return {}
@@ -88,7 +88,7 @@ class DatasetInfo(DataModel):
         cls,
         dataset: DatasetListRecord,
         version: DatasetListVersion,
-        job: Optional[Job],
+        job: Job | None,
     ) -> "Self":
         return cls(
             uuid=version.uuid,
datachain/lib/dc/__init__.py
CHANGED
@@ -9,7 +9,7 @@ from .pandas import read_pandas
 from .parquet import read_parquet
 from .records import read_records
 from .storage import read_storage
-from .utils import DatasetMergeError, DatasetPrepareError, Sys, is_studio
+from .utils import DatasetMergeError, DatasetPrepareError, Sys, is_local, is_studio
 from .values import read_values
 
 __all__ = [
@@ -21,6 +21,7 @@ __all__ = [
     "Sys",
     "datasets",
     "delete_dataset",
+    "is_local",
     "is_studio",
     "listings",
     "move_dataset",
datachain/lib/dc/csv.py
CHANGED
@@ -1,10 +1,6 @@
-from collections.abc import Sequence
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    Union,
-)
+import os
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING
 
 from datachain.lib.dc.utils import DatasetPrepareError, OutputType
 from datachain.lib.model_store import ModelStore
@@ -17,38 +13,38 @@ if TYPE_CHECKING:
 
 
 def read_csv(
-    path,
-    delimiter: Optional[str] = None,
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
+    delimiter: str | None = None,
     header: bool = True,
     output: OutputType = None,
     column: str = "",
     model_name: str = "",
     source: bool = True,
-    nrows=None,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
-    column_types: Optional[dict[str, Union[str, "ArrowDataType"]]] = None,
-    parse_options: Optional[dict[str, Union[str, bool, Callable]]] = None,
+    nrows: int | None = None,
+    session: Session | None = None,
+    settings: dict | None = None,
+    column_types: dict[str, "str | ArrowDataType"] | None = None,
+    parse_options: dict[str, str | bool | Callable] | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from csv files.
 
     Parameters:
-        path : Storage URI with directory. URI must start with storage prefix such
+        path: Storage URI with directory. URI must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///".
-        delimiter : Character for delimiting columns. Takes precedence if also
+        delimiter: Character for delimiting columns. Takes precedence if also
             specified in `parse_options`. Defaults to ",".
-        header : Whether the files include a header row.
-        output : Dictionary or feature class defining column names and their
+        header: Whether the files include a header row.
+        output: Dictionary or feature class defining column names and their
             corresponding types. List of column names is also accepted, in which
             case types will be inferred.
-        column : Created column name.
-        model_name : Generated model name.
-        source : Whether to include info about the source file.
-        nrows : Optional row limit.
-        session : Session to use for the chain.
-        settings : Settings to use for the chain.
-        column_types : Dictionary of column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        nrows: Optional row limit.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        column_types: Dictionary of column names and their corresponding types.
             It is passed to CSV reader and for each column specified type auto
             inference is disabled.
         parse_options: Tells the parser how to process lines.
@@ -67,7 +63,7 @@ def read_csv(
         chain = dc.read_csv("s3://mybucket/dir")
         ```
     """
-    from pandas.io.parsers import STR_NA_VALUES
+    from pandas._libs.parsers import STR_NA_VALUES
     from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
     from pyarrow.dataset import CsvFileFormat
     from pyarrow.lib import type_for_alias
datachain/lib/dc/database.py
CHANGED
@@ -2,7 +2,8 @@ import contextlib
 import itertools
 import os
 import sqlite3
-from typing import TYPE_CHECKING, Any, Optional, Union
+from collections.abc import Iterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any
 
 import sqlalchemy
 
@@ -12,8 +13,6 @@ from datachain.utils import batched
 DEFAULT_DATABASE_BATCH_SIZE = 10_000
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator, Mapping, Sequence
-
     import sqlalchemy.orm  # noqa: TC004
 
     from datachain.lib.data_model import DataType
@@ -21,21 +20,21 @@ if TYPE_CHECKING:
 
     from .datachain import DataChain
 
-ConnectionType = Union[
-    str,
-    sqlalchemy.engine.URL,
-    sqlalchemy.engine.interfaces.Connectable,
-    sqlalchemy.engine.Engine,
-    sqlalchemy.engine.Connection,
-    sqlalchemy.orm.Session,
-    sqlite3.Connection,
-]
+ConnectionType = (
+    str
+    | sqlalchemy.engine.URL
+    | sqlalchemy.engine.interfaces.Connectable
+    | sqlalchemy.engine.Engine
+    | sqlalchemy.engine.Connection
+    | sqlalchemy.orm.Session
+    | sqlite3.Connection
+)
 
 
 @contextlib.contextmanager
 def _connect(
     connection: "ConnectionType",
-) -> "Iterator[sqlalchemy.engine.Connection]":
+) -> Iterator[sqlalchemy.engine.Connection]:
     import sqlalchemy.orm
 
     with contextlib.ExitStack() as stack:
@@ -46,10 +45,14 @@ def _connect(
         yield stack.enter_context(engine.connect())
     elif isinstance(connection, sqlite3.Connection):
         engine = sqlalchemy.create_engine(
-            "sqlite://", creator=lambda: connection, **engine_kwargs
+            "sqlite://",
+            creator=lambda: connection,
+            poolclass=sqlalchemy.pool.StaticPool,
+            **engine_kwargs,
         )
-        #
-
+        # Close only the SQLAlchemy connection wrapper; the underlying
+        # sqlite3 connection remains managed by the caller via StaticPool.
+        yield stack.enter_context(engine.connect())
     elif isinstance(connection, sqlalchemy.Engine):
         yield stack.enter_context(connection.connect())
     elif isinstance(connection, sqlalchemy.Connection):
@@ -73,10 +76,10 @@ def to_database(
     table_name: str,
     connection: "ConnectionType",
     *,
-
-    on_conflict: Optional[str] = None,
-    conflict_columns: Optional[list[str]] = None,
-    column_mapping: Optional[dict[str, Optional[str]]] = None,
+    batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
+    on_conflict: str | None = None,
+    conflict_columns: list[str] | None = None,
+    column_mapping: dict[str, str | None] | None = None,
 ) -> int:
     """
     Implementation function for exporting DataChain to database tables.
@@ -124,7 +127,7 @@ def to_database(
     table.create(conn, checkfirst=True)
 
     rows_iter = chain._leaf_values()
-    for batch in batched(rows_iter, DEFAULT_DATABASE_BATCH_SIZE):
+    for batch in batched(rows_iter, batch_size):
         rows_affected = _process_batch(
             conn,
             table,
@@ -150,8 +153,8 @@ def to_database(
 
 
 def _normalize_column_mapping(
-    column_mapping: dict[str, Optional[str]],
-) -> dict[str, Optional[str]]:
+    column_mapping: dict[str, str | None],
+) -> dict[str, str | None]:
     """
     Convert column mapping keys from DataChain format (dots) to database format
     (double underscores).
@@ -163,7 +166,7 @@ def _normalize_column_mapping(
     if not column_mapping:
        return {}
 
-    normalized_mapping: dict[str, Optional[str]] = {}
+    normalized_mapping: dict[str, str | None] = {}
     original_keys: dict[str, str] = {}
     for key, value in column_mapping.items():
         db_key = ColumnMeta.to_db_name(key)
@@ -181,7 +184,7 @@ def _normalize_column_mapping(
     from collections import defaultdict
 
     default_factory = column_mapping.default_factory
-    result: dict[str, Optional[str]] = defaultdict(default_factory)
+    result: dict[str, str | None] = defaultdict(default_factory)
     result.update(normalized_mapping)
     return result
 
@@ -189,8 +192,8 @@ def _normalize_column_mapping(
 
 
 def _normalize_conflict_columns(
-    conflict_columns: Optional[list[str]], column_mapping: dict[str, Optional[str]]
-) -> Optional[list[str]]:
+    conflict_columns: list[str] | None, column_mapping: dict[str, str | None]
+) -> list[str] | None:
     """
     Normalize conflict_columns by converting DataChain format to database format
     and applying column mapping.
@@ -297,15 +300,15 @@ def _process_batch(
 
 
 def read_database(
-    query: Union[str, "sqlalchemy.sql.expression.Executable"],
+    query: "str | sqlalchemy.sql.expression.Executable",
     connection: "ConnectionType",
-    params: "Optional[Union[Sequence[Mapping[str, Any]], Mapping[str, Any]]]" = None,
+    params: Sequence[Mapping[str, Any]] | Mapping[str, Any] | None = None,
     *,
-    output: Optional[dict[str, "DataType"]] = None,
-    session: Optional["Session"] = None,
-    settings: Optional[dict] = None,
+    output: dict[str, "DataType"] | None = None,
+    session: "Session | None" = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    infer_schema_length: Optional[int] = 100,
+    infer_schema_length: int | None = 100,
 ) -> "DataChain":
     """
     Read the results of a SQL query into a DataChain, using a given database connection.
@@ -382,7 +385,7 @@ def read_database(
 def _infer_schema(
     result: "sqlalchemy.engine.Result",
     to_infer: list[str],
-    infer_schema_length: Optional[int] = 100,
+    infer_schema_length: int | None = 100,
 ) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
     from datachain.lib.convert.values_to_tuples import values_to_tuples
 
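
The `StaticPool` switch in `_connect` is most visible with an in-memory SQLite connection, where any second DB-API connection would otherwise see an empty database; pinning the caller's connection keeps its tables visible to queries. A usage sketch under that assumption (table and rows are hypothetical, and the top-level `dc.read_database` re-export is assumed):

```python
import sqlite3

import datachain as dc

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE items (id INTEGER, name TEXT)")
conn.executemany("INSERT INTO items VALUES (?, ?)", [(1, "a"), (2, "b")])

# _connect wraps the caller's sqlite3 connection in a StaticPool-backed
# engine, so this query runs against the in-memory table created above.
chain = dc.read_database("SELECT id, name FROM items", conn)
```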