datachain 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +2 -92
- datachain/cli.py +0 -37
- datachain/lib/arrow.py +5 -5
- datachain/lib/clip.py +14 -3
- datachain/lib/convert/python_to_sql.py +9 -0
- datachain/lib/data_model.py +10 -1
- datachain/lib/dc.py +135 -39
- datachain/lib/hf.py +166 -0
- datachain/lib/image.py +9 -1
- datachain/lib/pytorch.py +1 -2
- datachain/lib/signal_schema.py +124 -20
- datachain/lib/text.py +4 -0
- datachain/lib/udf.py +14 -20
- datachain/lib/webdataset.py +1 -1
- datachain/query/dataset.py +24 -9
- datachain/query/session.py +5 -3
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/METADATA +19 -15
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/RECORD +22 -21
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/WHEEL +1 -1
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/LICENSE +0 -0
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.7.dist-info → datachain-0.3.9.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1540,87 +1540,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)

-    def merge_datasets(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: int,
-        dst_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Merges records from source to destination dataset.
-        It will create new version
-        of a dataset with records merged from old version and the source, unless
-        existing version is specified for destination in which case it must
-        be in non final status as datasets are immutable
-        """
-        if (
-            dst_version
-            and not dst.is_valid_next_version(dst_version)
-            and dst.get_version(dst_version).is_final_status()
-        ):
-            raise DatasetInvalidVersionError(
-                f"Version {dst_version} must be higher than the current latest one"
-            )
-
-        src_dep = self.get_dataset_dependencies(src.name, src_version)
-        dst_dep = self.get_dataset_dependencies(
-            dst.name,
-            dst.latest_version,  # type: ignore[arg-type]
-        )
-
-        if dst.has_version(dst_version):  # type: ignore[arg-type]
-            # case where we don't create new version, but append to the existing one
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version=dst_version,  # type: ignore[arg-type]
-            )
-            merged_schema = src.serialized_schema | dst.serialized_schema
-            self.update_dataset(dst, schema=merged_schema)
-            self.update_dataset_version_with_warehouse_info(
-                dst,
-                dst_version,  # type: ignore[arg-type]
-                schema=merged_schema,
-            )
-            for dep in src_dep:
-                if dep and dep not in dst_dep:
-                    self.metastore.add_dependency(
-                        dep,
-                        dst.name,
-                        dst_version,  # type: ignore[arg-type]
-                    )
-        else:
-            # case where we create new version of merged results
-            src_dr = self.warehouse.dataset_rows(src, src_version)
-            dst_dr = self.warehouse.dataset_rows(dst)
-
-            merge_result_columns = list(
-                {
-                    c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
-                }.values()
-            )
-
-            dst_version = dst_version or dst.next_version
-            dst = self.create_new_dataset_version(
-                dst,
-                dst_version,
-                columns=merge_result_columns,
-            )
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version,
-            )
-            self.update_dataset_version_with_warehouse_info(dst, dst_version)
-            for dep in set(src_dep + dst_dep):
-                if dep:
-                    self.metastore.add_dependency(dep, dst.name, dst_version)
-
-        return dst
-
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
     ) -> Optional[dict]:
@@ -1641,17 +1560,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)

         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name

-        schema = SignalSchema.deserialize(
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1997,7 +1907,7 @@ class Catalog:
         """
         from datachain.query.dataset import ExecutionResult

-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
            dir=os.getcwd(), suffix=".py", delete=False
        )
        _, feature_module = os.path.split(feature_file.name)
datachain/cli.py
CHANGED
@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Display size using powers of 1000 not 1024",
     )

-    parse_merge_datasets = subp.add_parser(
-        "merge-datasets", parents=[parent_parser], description="Merges datasets"
-    )
-    parse_merge_datasets.add_argument(
-        "--src",
-        action="store",
-        default=None,
-        help="Source dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst",
-        action="store",
-        default=None,
-        help="Destination dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--src-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Source dataset version",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Destination dataset version",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents"
     )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             new_name=args.new_name,
             labels=args.labels,
         )
-    elif args.command == "merge-datasets":
-        catalog.merge_datasets(
-            catalog.get_dataset(args.src),
-            catalog.get_dataset(args.dst),
-            args.src_version,
-            dst_version=args.dst_version,
-        )
     elif args.command == "ls":
         ls(
             args.sources,
datachain/lib/arrow.py
CHANGED
@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype =
+        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
         if field.nullable:
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
     return output


-def
+def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime

@@ -122,16 +122,16 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[
+        return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
-        return
+        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")


 def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)
+    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
     with file.open(mode="r") as reader:
         with open(tf.name, "a") as writer:
             for row, line in enumerate(reader):
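For reference, a minimal sketch of how the now-public `arrow_type_mapper` could be used on its own to translate a pyarrow schema into Python annotations, mirroring the `schema_to_output` loop in the hunk above. The example schema and field names are illustrative, and only types visible in this diff (string, list, struct) are exercised:

```py
# Sketch only: map pyarrow field types to Python annotations.
from typing import Optional

import pyarrow as pa

from datachain.lib.arrow import arrow_type_mapper  # made public in this release

schema = pa.schema(
    [
        pa.field("name", pa.string()),
        pa.field("tags", pa.list_(pa.string())),
        pa.field("meta", pa.struct([("width", pa.int32())])),
    ]
)

for field in schema:
    dtype = arrow_type_mapper(field.type)  # str, list[str], dict for the fields above
    if field.nullable:
        dtype = Optional[dtype]  # same nullable handling as schema_to_output
    print(field.name, dtype)
```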
datachain/lib/clip.py
CHANGED
@@ -1,5 +1,5 @@
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

 import torch
 from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
     tokenizer: Callable,
     prob: bool = False,
     image_to_text: bool = True,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> list[list[float]]:
     """
     Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@ def clip_similarity_scores(
         prob : Compute softmax probabilities.
         image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
             if only one of images or text provided.
+        device : Device to use. Defaults is None - use model's device.


     Example:
@@ -130,17 +132,26 @@ def clip_similarity_scores(
         ```
     """

+    if device is None:
+        if hasattr(model, "device"):
+            device = model.device
+        else:
+            device = next(model.parameters()).device
+    else:
+        model = model.to(device)
     with torch.no_grad():
         if images is not None:
             encoder = _get_encoder(model, "image")
             image_features = convert_images(
-                images, transform=preprocess, encoder=encoder
+                images, transform=preprocess, encoder=encoder, device=device
             )
             image_features /= image_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]

         if text is not None:
             encoder = _get_encoder(model, "text")
-            text_features = convert_text(
+            text_features = convert_text(
+                text, tokenizer, encoder=encoder, device=device
+            )
             text_features /= text_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]

         if images is not None and text is not None:
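A hedged sketch of the new `device` argument added to `clip_similarity_scores` above. The keyword names (`images`, `text`, `model`, `preprocess`, `tokenizer`) appear in the hunks, but the open_clip model setup is an assumption and not part of this diff:

```py
# Sketch only: passing the new `device` argument explicitly (new in 0.3.9).
import open_clip
import torch
from PIL import Image

from datachain.lib.clip import clip_similarity_scores

# Assumed setup; any CLIP-style model/preprocess/tokenizer trio should work.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

scores = clip_similarity_scores(
    images=Image.open("cat.jpg"),
    text="a photo of a cat",
    model=model,
    preprocess=preprocess,
    tokenizer=tokenizer,
    prob=True,
    # When omitted, the model's own device is used, per the added code above.
    device="cuda" if torch.cuda.is_available() else "cpu",
)
print(scores)
```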
datachain/lib/convert/python_to_sql.py
CHANGED
@@ -73,6 +73,9 @@ def python_to_sql(typ):  # noqa: PLR0911
     if len(args) == 2 and (type(None) in args):
         return python_to_sql(args[0])

+    if _is_union_str_literal(orig, args):
+        return String
+
     if _is_json_inside_union(orig, args):
         return JSON

@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
     if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
         return True
     return False
+
+
+def _is_union_str_literal(orig, args) -> bool:
+    if orig != Union:
+        return False
+    return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
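Based on the added helper, a union of `str` and `Literal` members should now map to a SQL `String` column instead of falling through to the JSON handling. A minimal sketch; the printed return values are expectations inferred from this hunk, not verified output:

```py
# Sketch only: illustrates the new Union[str, Literal[...]] handling.
from typing import Literal, Optional, Union

from datachain.lib.convert.python_to_sql import python_to_sql

# A union of str and Literal values is now treated as a plain string column.
print(python_to_sql(Union[str, Literal["train", "test"]]))  # expected: String

# Optional[str] still unwraps to the underlying type first.
print(python_to_sql(Optional[str]))
```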
datachain/lib/data_model.py
CHANGED
@@ -2,7 +2,7 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin

-from pydantic import BaseModel
+from pydantic import BaseModel, create_model

 from datachain.lib.model_store import ModelStore

@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
         return is_chain_type(args[0])

     return False
+
+
+def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
+    fields = {name: (anno, ...) for name, anno in data_dict.items()}
+    return create_model(
+        name,
+        __base__=(DataModel,),  # type: ignore[call-overload]
+        **fields,
+    )  # type: ignore[call-overload]
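`dict_to_data_model` (the public replacement for `DataChain._dict_to_data_model`, removed later in this diff) builds a `DataModel` subclass from a plain name-to-type mapping. A small sketch; the model name and fields are illustrative:

```py
# Sketch only: building a pydantic-based DataModel dynamically from a dict.
from datachain.lib.data_model import dict_to_data_model

Review = dict_to_data_model(
    "review",
    {"text": str, "stars": int, "tags": list[str]},
)

row = Review(text="great", stars=5, tags=["short"])
print(row.stars)  # 5
```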
datachain/lib/dc.py
CHANGED
@@ -18,14 +18,13 @@ from typing import (

 import pandas as pd
 import sqlalchemy
-from pydantic import BaseModel
+from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType

-from datachain import DataModel
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
-from datachain.lib.data_model import DataType
+from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.file import File, IndexedFile, get_file
@@ -55,6 +54,8 @@ from datachain.utils import inside_notebook
 if TYPE_CHECKING:
     from typing_extensions import Concatenate, ParamSpec, Self

+    from datachain.lib.hf import HFDatasetType
+
     P = ParamSpec("P")

 C = Column
@@ -77,12 +78,12 @@ def resolve_columns(
     @wraps(method)
     def _inner(self: D, *args: "P.args", **kwargs: "P.kwargs") -> D:
         resolved_args = self.signals_schema.resolve(
-            *[arg for arg in args if not isinstance(arg, GenericFunction)]
+            *[arg for arg in args if not isinstance(arg, GenericFunction)]  # type: ignore[arg-type]
         ).db_signals()

         for idx, arg in enumerate(args):
             if isinstance(arg, GenericFunction):
-                resolved_args.insert(idx, arg)
+                resolved_args.insert(idx, arg)  # type: ignore[arg-type]

         return method(self, *resolved_args, **kwargs)

@@ -208,23 +209,28 @@ class DataChain(DatasetQuery):
         "size": 0,
     }

-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, settings: Optional[dict] = None, **kwargs):
         """This method needs to be redefined as a part of Dataset and DataChain
         decoupling.
         """
-        super().__init__(
+        super().__init__(  # type: ignore[misc]
             *args,
             **kwargs,
             indexing_column_types=File._datachain_column_types,
         )
-
-
+        if settings:
+            self._settings = Settings(**settings)
+        else:
+            self._settings = Settings()
+        self._setup: dict = {}

         self.signals_schema = SignalSchema({"sys": Sys})
         if self.feature_schema:
             self.signals_schema |= SignalSchema.deserialize(self.feature_schema)
         else:
-            self.signals_schema |= SignalSchema.from_column_types(
+            self.signals_schema |= SignalSchema.from_column_types(
+                self.column_types or {}
+            )

         self._sys = False

@@ -309,6 +315,7 @@ class DataChain(DatasetQuery):
         *,
         type: Literal["binary", "text", "image"] = "binary",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         recursive: Optional[bool] = True,
         object_name: str = "file",
@@ -336,6 +343,7 @@ class DataChain(DatasetQuery):
         cls(
             path,
             session=session,
+            settings=settings,
             recursive=recursive,
             update=update,
             in_memory=in_memory,
@@ -489,6 +497,7 @@ class DataChain(DatasetQuery):
     def datasets(
         cls,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "dataset",
     ) -> "DataChain":
@@ -513,6 +522,7 @@ class DataChain(DatasetQuery):

         return cls.from_values(
             session=session,
+            settings=settings,
             in_memory=in_memory,
             output={object_name: DatasetInfo},
             **{object_name: datasets},  # type: ignore[arg-type]
@@ -895,7 +905,7 @@ class DataChain(DatasetQuery):
         if isinstance(value, Column):
             # renaming existing column
             for signal in schema.db_signals(name=value.name, as_columns=True):
-                mutated[signal.name.replace(value.name, name, 1)] = signal
+                mutated[signal.name.replace(value.name, name, 1)] = signal  # type: ignore[union-attr]
         else:
             # adding new signal
             mutated[name] = value
@@ -1086,7 +1096,7 @@ class DataChain(DatasetQuery):
         )

         signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns = signals_schema.resolve(*on).db_signals()
+        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]

         right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
@@ -1105,7 +1115,9 @@ class DataChain(DatasetQuery):
                 on, right_on, "'on' and 'right_on' must have the same length'"
             )

-        right_on_columns = right_signals_schema.resolve(
+        right_on_columns: list[str] = right_signals_schema.resolve(
+            *right_on
+        ).db_signals()  # type: ignore[assignment]

         if len(right_on_columns) != len(on_columns):
             on_str = ", ".join(right_on_columns)
@@ -1141,17 +1153,35 @@ class DataChain(DatasetQuery):
         self,
         other: "DataChain",
         on: Optional[Union[str, Sequence[str]]] = None,
+        right_on: Optional[Union[str, Sequence[str]]] = None,
     ) -> "Self":
         """Remove rows that appear in another chain.

         Parameters:
             other: chain whose rows will be removed from `self`
-            on: columns to consider for determining row equality
-                defaults to all common columns
+            on: columns to consider for determining row equality in `self`.
+                If unspecified, defaults to all common columns
+                between `self` and `other`.
+            right_on: columns to consider for determining row equality in `other`.
+                If unspecified, defaults to the same values as `on`.
         """
         if isinstance(on, str):
+            if not on:
+                raise DataChainParamsError("'on' cannot be an empty string")
             on = [on]
-
+        elif isinstance(on, Sequence):
+            if not on or any(not col for col in on):
+                raise DataChainParamsError("'on' cannot contain empty strings")
+
+        if isinstance(right_on, str):
+            if not right_on:
+                raise DataChainParamsError("'right_on' cannot be an empty string")
+            right_on = [right_on]
+        elif isinstance(right_on, Sequence):
+            if not right_on or any(not col for col in right_on):
+                raise DataChainParamsError("'right_on' cannot contain empty strings")
+
+        if on is None and right_on is None:
             other_columns = set(other._effective_signals_schema.db_signals())
             signals = [
                 c
@@ -1160,16 +1190,29 @@ class DataChain(DatasetQuery):
             ]
             if not signals:
                 raise DataChainParamsError("subtract(): no common columns")
-        elif not
-
-
-
-        elif not on:
+        elif on is not None and right_on is None:
+            right_on = on
+            signals = list(self.signals_schema.resolve(*on).db_signals())
+        elif on is None and right_on is not None:
             raise DataChainParamsError(
-                "'on'
+                "'on' must be specified when 'right_on' is provided"
             )
         else:
-
+            if not isinstance(on, Sequence) or not isinstance(right_on, Sequence):
+                raise TypeError(
+                    "'on' and 'right_on' must be 'str' or 'Sequence' object"
+                )
+            if len(on) != len(right_on):
+                raise DataChainParamsError(
+                    "'on' and 'right_on' must have the same length"
+                )
+            signals = list(
+                zip(
+                    self.signals_schema.resolve(*on).db_signals(),
+                    other.signals_schema.resolve(*right_on).db_signals(),
+                )  # type: ignore[arg-type]
+            )
+
         return super()._subtract(other, signals)  # type: ignore[arg-type]

     @classmethod
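The reworked `subtract` above now takes a separate `right_on` for columns in the other chain and rejects empty column names. A hedged usage sketch; `from_dataset` and the dataset/column names here are illustrative, not part of this diff:

```py
# Sketch only: subtract rows matched on differently named columns (new right_on).
from datachain.lib.dc import DataChain

processed = DataChain.from_dataset("processed_images")  # assumed to have "file_id"
incoming = DataChain.from_dataset("incoming_images")    # assumed to have "id"

# Keep only incoming rows whose "id" does not appear as "file_id" in processed.
remaining = incoming.subtract(processed, on="id", right_on="file_id")
```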
@@ -1177,6 +1220,7 @@ class DataChain(DatasetQuery):
         cls,
         ds_name: str = "",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         output: OutputType = None,
         object_name: str = "",
@@ -1195,10 +1239,13 @@ class DataChain(DatasetQuery):
             yield from tuples

         chain = DataChain.from_records(
-            DataChain.DEFAULT_FILE_RECORD,
+            DataChain.DEFAULT_FILE_RECORD,
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
         )
         if object_name:
-            output = {object_name:
+            output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
         return chain.gen(_func_fr, output=output)

     @classmethod
@@ -1207,6 +1254,7 @@ class DataChain(DatasetQuery):
         df: "pd.DataFrame",
         name: str = "",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "",
     ) -> "DataChain":
@@ -1236,7 +1284,12 @@ class DataChain(DatasetQuery):
         )

         return cls.from_values(
-            name,
+            name,
+            session,
+            settings=settings,
+            object_name=object_name,
+            in_memory=in_memory,
+            **fr_map,
         )

     def to_pandas(self, flatten=False) -> "pd.DataFrame":
@@ -1306,6 +1359,59 @@ class DataChain(DatasetQuery):
         if len(df) == limit:
             print(f"\n[Limited by {len(df)} rows]")

+    @classmethod
+    def from_hf(
+        cls,
+        dataset: Union[str, "HFDatasetType"],
+        *args,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
+        object_name: str = "",
+        model_name: str = "",
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from huggingface hub dataset.
+
+        Parameters:
+            dataset : Path or name of the dataset to read from Hugging Face Hub,
+                or an instance of `datasets.Dataset`-like object.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.
+            object_name : Generated object column name.
+            model_name : Generated model name.
+            kwargs : Parameters to pass to datasets.load_dataset.
+
+        Example:
+            Load from Hugging Face Hub:
+            ```py
+            DataChain.from_hf("beans", split="train")
+            ```
+
+            Generate chain from loaded dataset:
+            ```py
+            from datasets import load_dataset
+            ds = load_dataset("beans", split="train")
+            DataChain.from_hf(ds)
+            ```
+        """
+        from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
+
+        output: dict[str, DataType] = {}
+        ds_dict = stream_splits(dataset, *args, **kwargs)
+        if len(ds_dict) > 1:
+            output = {"split": str}
+
+        model_name = model_name or object_name or ""
+        output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
+        model = dict_to_data_model(model_name, output)
+        if object_name:
+            output = {object_name: model}
+
+        chain = DataChain.from_values(
+            split=list(ds_dict.keys()), session=session, settings=settings
+        )
+        return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
+
     def parse_tabular(
         self,
         output: OutputType = None,
@@ -1367,7 +1473,7 @@ class DataChain(DatasetQuery):

         if isinstance(output, dict):
             model_name = model_name or object_name or ""
-            model =
+            model = dict_to_data_model(model_name, output)
         else:
             model = output  # type: ignore[assignment]

@@ -1384,17 +1490,6 @@ class DataChain(DatasetQuery):
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )

-    @staticmethod
-    def _dict_to_data_model(
-        name: str, data_dict: dict[str, DataType]
-    ) -> type[BaseModel]:
-        fields = {name: (anno, ...) for name, anno in data_dict.items()}
-        return create_model(
-            name,
-            __base__=(DataModel,),  # type: ignore[call-overload]
-            **fields,
-        )  # type: ignore[call-overload]
-
     @classmethod
     def from_csv(
         cls,
@@ -1543,6 +1638,7 @@ class DataChain(DatasetQuery):
         cls,
         to_insert: Optional[Union[dict, list[dict]]],
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         schema: Optional[dict[str, DataType]] = None,
     ) -> "DataChain":
@@ -1597,7 +1693,7 @@ class DataChain(DatasetQuery):
         insert_q = dr.get_table().insert()
         for record in to_insert:
             db.execute(insert_q.values(**record))
-        return DataChain(name=dsr.name)
+        return DataChain(name=dsr.name, settings=settings)

     def sum(self, fr: DataType):  # type: ignore[override]
         """Compute the sum of a column."""