datachain 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +646 -152
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +39 -14
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/lib/webdataset.py +4 -3
  26. datachain/node.py +11 -8
  27. datachain/query/dataset.py +66 -147
  28. datachain/query/dispatch.py +15 -13
  29. datachain/query/schema.py +2 -0
  30. datachain/query/session.py +4 -4
  31. datachain/sql/functions/array.py +12 -0
  32. datachain/sql/functions/string.py +8 -0
  33. datachain/torch/__init__.py +1 -1
  34. datachain/utils.py +45 -0
  35. datachain-0.2.12.dist-info/METADATA +412 -0
  36. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/RECORD +40 -45
  37. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  38. datachain/lib/feature_registry.py +0 -77
  39. datachain/lib/gpt4_vision.py +0 -97
  40. datachain/lib/hf_image_to_text.py +0 -97
  41. datachain/lib/hf_pipeline.py +0 -90
  42. datachain/lib/image_transform.py +0 -103
  43. datachain/lib/iptc_exif_xmp.py +0 -76
  44. datachain/lib/unstructured.py +0 -41
  45. datachain/text/__init__.py +0 -3
  46. datachain-0.2.10.dist-info/METADATA +0 -430
  47. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
datachain/lib/webdataset.py CHANGED
@@ -13,8 +13,9 @@ from typing import (
     get_origin,
 )
 
-from pydantic import BaseModel, Field
+from pydantic import Field
 
+from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError
 
@@ -45,7 +46,7 @@ class UnknownFileExtensionError(WDSError):
         super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")
 
 
-class WDSBasic(BaseModel):
+class WDSBasic(DataModel):
     file: File
 
 
@@ -74,7 +75,7 @@ class WDSAllFile(WDSBasic):
     cbor: Optional[bytes] = Field(default=None)
 
 
-class WDSReadableSubclass(BaseModel):
+class WDSReadableSubclass(DataModel):
     @staticmethod
     def _reader(builder, item: tarfile.TarInfo) -> "WDSReadableSubclass":
         raise NotImplementedError
datachain/node.py CHANGED
@@ -5,7 +5,7 @@ import attrs
 
 from datachain.cache import UniqueId
 from datachain.storage import StorageURI
-from datachain.utils import time_to_str
+from datachain.utils import TIME_ZERO, time_to_str
 
 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -111,13 +111,16 @@ class Node:
         if storage is None:
             storage = self.source
         return UniqueId(
-            storage,
-            self.parent,
-            self.name,
-            self.etag,
-            self.size,
-            self.vtype,
-            self.location,
+            storage=storage,
+            parent=self.parent,
+            name=self.name,
+            size=self.size,
+            version=self.version or "",
+            etag=self.etag,
+            is_latest=self.is_latest,
+            vtype=self.vtype,
+            location=self.location,
+            last_modified=self.last_modified or TIME_ZERO,
         )
 
     @classmethod
datachain/query/dataset.py CHANGED
@@ -1,4 +1,3 @@
-import ast
 import contextlib
 import datetime
 import inspect
@@ -10,7 +9,6 @@ import re
 import string
 import subprocess
 import sys
-import types
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
@@ -26,12 +24,9 @@ from typing import (
 )
 
 import attrs
-import pandas as pd
 import sqlalchemy
 from attrs import frozen
-from dill import dumps, source
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
-from pydantic import BaseModel
 from sqlalchemy import Column
 from sqlalchemy.sql import func as f
 from sqlalchemy.sql.elements import ColumnClause, ColumnElement
@@ -53,10 +48,14 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.progress import CombinedDownloadCallback
-from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.functions import rand
 from datachain.storage import Storage, StorageURI
-from datachain.utils import batched, determine_processes, inside_notebook
+from datachain.utils import (
+    batched,
+    determine_processes,
+    filtered_cloudpickle_dumps,
+    get_datachain_executable,
+)
 
 from .metrics import metrics
 from .schema import C, UDFParamSpec, normalize_param
@@ -428,7 +427,7 @@ def get_generated_callback(is_generator: bool = False) -> Callback:
 
 
 @frozen
-class UDF(Step, ABC):
+class UDFStep(Step, ABC):
     udf: UDFType
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
@@ -492,7 +491,7 @@ class UDF(Step, ABC):
         elif processes:
             # Parallel processing (faster for more CPU-heavy UDFs)
             udf_info = {
-                "udf": self.udf,
+                "udf_data": filtered_cloudpickle_dumps(self.udf),
                 "catalog_init": self.catalog.get_init_params(),
                 "id_generator_clone_params": (
                     self.catalog.id_generator.clone_params()
@@ -509,20 +508,18 @@ class UDF(Step, ABC):
 
             # Run the UDFDispatcher in another process to avoid needing
             # if __name__ == '__main__': in user scripts
-            datachain_exec_path = os.environ.get("DATACHAIN_EXEC_PATH", "datachain")
-
+            exec_cmd = get_datachain_executable()
             envs = dict(os.environ)
             envs.update({"PYTHONPATH": os.getcwd()})
-            with self.process_feature_module():
-                process_data = dumps(udf_info, recurse=True)
-                result = subprocess.run(  # noqa: S603
-                    [datachain_exec_path, "--internal-run-udf"],
-                    input=process_data,
-                    check=False,
-                    env=envs,
-                )
-                if result.returncode != 0:
-                    raise RuntimeError("UDF Execution Failed!")
+            process_data = filtered_cloudpickle_dumps(udf_info)
+            result = subprocess.run(  # noqa: S603
+                [*exec_cmd, "internal-run-udf"],
+                input=process_data,
+                check=False,
+                env=envs,
+            )
+            if result.returncode != 0:
+                raise RuntimeError("UDF Execution Failed!")
 
         else:
             # Otherwise process single-threaded (faster for smaller UDFs)
@@ -571,57 +568,6 @@ class UDF(Step, ABC):
             self.catalog.warehouse.close()
             raise
 
-    @contextlib.contextmanager
-    def process_feature_module(self):
-        # Generate a random name for the feature module
-        feature_module_name = "tmp" + _random_string(10)
-        # Create a dynamic module with the generated name
-        dynamic_module = types.ModuleType(feature_module_name)
-        # Get the import lines for the necessary objects from the main module
-        main_module = sys.modules["__main__"]
-        if getattr(main_module, "__file__", None):
-            import_lines = list(get_imports(main_module))
-        else:
-            import_lines = [
-                source.getimport(obj, alias=name)
-                for name, obj in main_module.__dict__.items()
-                if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
-            ]
-
-        # Get the feature classes from the main module
-        feature_classes = {
-            name: obj
-            for name, obj in main_module.__dict__.items()
-            if _feature_predicate(obj)
-        }
-        if not feature_classes:
-            yield None
-            return
-
-        # Get the source code of the feature classes
-        feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
-        # Set the module name for the feature classes to the generated name
-        for name, cls in feature_classes.items():
-            cls.__module__ = feature_module_name
-            setattr(dynamic_module, name, cls)
-        # Add the dynamic module to the sys.modules dictionary
-        sys.modules[feature_module_name] = dynamic_module
-        # Combine the import lines and feature sources
-        feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)
-
-        # Write the module content to a .py file
-        with open(f"{feature_module_name}.py", "w") as module_file:
-            module_file.write(feature_file)
-
-        try:
-            yield feature_module_name
-        finally:
-            for cls in feature_classes.values():
-                cls.__module__ = main_module.__name__
-            os.unlink(f"{feature_module_name}.py")
-            # Remove the dynamic module from sys.modules
-            del sys.modules[feature_module_name]
-
     def create_partitions_table(self, query: Select) -> "Table":
         """
         Create temporary table with group by partitions.
@@ -689,7 +635,7 @@ class UDF(Step, ABC):
 
 
 @frozen
-class UDFSignal(UDF):
+class UDFSignal(UDFStep):
     is_generator = False
 
     def create_udf_table(self, query: Select) -> "Table":
@@ -784,7 +730,7 @@ class UDFSignal(UDF):
 
 
 @frozen
-class RowGenerator(UDF):
+class RowGenerator(UDFStep):
     """Extend dataset with new rows."""
 
     is_generator = True
@@ -919,6 +865,18 @@ class SQLCount(SQLClause):
         return sqlalchemy.select(f.count(1)).select_from(query.subquery())
 
 
+@frozen
+class SQLDistinct(SQLClause):
+    args: tuple[ColumnElement, ...]
+    dialect: str
+
+    def apply_sql_clause(self, query):
+        if self.dialect == "sqlite":
+            return query.group_by(*self.args)
+
+        return query.distinct(*self.args)
+
+
 @frozen
 class SQLUnion(Step):
     query1: "DatasetQuery"
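Note on the new SQLDistinct step above: SQLite has no `DISTINCT ON (col, ...)`, so on that dialect the step falls back to `GROUP BY` over the same columns, which likewise keeps one row per unique combination; other dialects get a real `distinct(*cols)`. A minimal SQLAlchemy sketch of the same fallback, using an invented `files` table rather than datachain code:

```python
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite


def distinct_on(query, *cols, dialect_name):
    # SQLite lacks DISTINCT ON, so grouping by the same columns is used
    # instead; each group contributes a single row to the result.
    if dialect_name == "sqlite":
        return query.group_by(*cols)
    return query.distinct(*cols)


files = sa.table("files", sa.column("parent"), sa.column("name"))
stmt = distinct_on(
    sa.select(files.c.parent, files.c.name), files.c.parent, dialect_name="sqlite"
)
print(stmt.compile(dialect=sqlite.dialect()))  # ... GROUP BY files.parent
```

Further down in this file the step is exposed as `DatasetQuery.distinct(*args)`, which passes the warehouse's dialect name in exactly this way.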
@@ -1000,12 +958,15 @@ class SQLJoin(Step):
 
         q1_columns = list(q1.c)
         q1_column_names = {c.name for c in q1_columns}
-        q2_columns = [
-            c
-            if c.name not in q1_column_names and c.name != "sys__id"
-            else c.label(self.rname.format(name=c.name))
-            for c in q2.c
-        ]
+
+        q2_columns = []
+        for c in q2.c:
+            if c.name.startswith("sys__"):
+                continue
+
+            if c.name in q1_column_names:
+                c = c.label(self.rname.format(name=c.name))
+            q2_columns.append(c)
 
         res_columns = q1_columns + q2_columns
         predicates = (
@@ -1112,6 +1073,7 @@ class DatasetQuery:
         anon: bool = False,
         indexing_feature_schema: Optional[dict] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
+        update: Optional[bool] = False,
     ):
         if client_config is None:
             client_config = {}
@@ -1134,10 +1096,12 @@ class DatasetQuery:
         self.session = Session.get(session, catalog=catalog)
 
         if path:
-            self.starting_step = IndexingStep(path, self.catalog, {}, recursive)
+            kwargs = {"update": True} if update else {}
+            self.starting_step = IndexingStep(path, self.catalog, kwargs, recursive)
             self.feature_schema = indexing_feature_schema
             self.column_types = indexing_column_types
         elif name:
+            self.name = name
             ds = self.catalog.get_dataset(name)
             self.version = version or ds.latest_version
             self.feature_schema = ds.get_version(self.version).feature_schema
@@ -1145,9 +1109,6 @@ class DatasetQuery:
             if "sys__id" in self.column_types:
                 self.column_types.pop("sys__id")
             self.starting_step = QueryStep(self.catalog, name, self.version)
-            # attaching to specific dataset
-            self.name = name
-            self.version = version
         else:
             raise ValueError("must provide path or name")
 
@@ -1156,7 +1117,7 @@ class DatasetQuery:
         return bool(re.compile(r"^[a-zA-Z0-9]+://").match(path))
 
     def __iter__(self):
-        return iter(self.results())
+        return iter(self.db_results())
 
     def __or__(self, other):
         return self.union(other)
@@ -1277,13 +1238,16 @@ class DatasetQuery:
             warehouse.close()
         self.temp_table_names = []
 
-    def results(self, row_factory=None, **kwargs):
+    def db_results(self, row_factory=None, **kwargs):
        with self.as_iterable(**kwargs) as result:
            if row_factory:
                cols = result.columns
                return [row_factory(cols, r) for r in result]
            return list(result)
 
+    def to_db_records(self) -> list[dict[str, Any]]:
+        return self.db_results(lambda cols, row: dict(zip(cols, row)))
+
     @contextlib.contextmanager
     def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
         try:
@@ -1343,15 +1307,6 @@ class DatasetQuery:
         finally:
             self.cleanup()
 
-    def to_records(self) -> list[dict[str, Any]]:
-        return self.results(lambda cols, row: dict(zip(cols, row)))
-
-    def to_pandas(self) -> "pd.DataFrame":
-        records = self.to_records()
-        df = pd.DataFrame.from_records(records)
-        df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
-        return df
-
     def shuffle(self) -> "Self":
         # ToDo: implement shaffle based on seed and/or generating random column
         return self.order_by(C.sys__rand)
@@ -1370,22 +1325,6 @@ class DatasetQuery:
 
         return sampled.limit(n)
 
-    def show(self, limit=20) -> None:
-        df = self.limit(limit).to_pandas()
-
-        options = ["display.max_colwidth", 50, "display.show_dimensions", False]
-        with pd.option_context(*options):
-            if inside_notebook():
-                from IPython.display import display
-
-                display(df)
-
-            else:
-                print(df.to_string())
-
-        if len(df) == limit:
-            print(f"[limited by {limit} objects]")
-
     def clone(self, new_table=True) -> "Self":
         obj = copy(self)
         obj.steps = obj.steps.copy()
@@ -1483,6 +1422,14 @@ class DatasetQuery:
         query.steps.append(SQLOffset(offset))
         return query
 
+    @detach
+    def distinct(self, *args) -> "Self":
+        query = self.clone()
+        query.steps.append(
+            SQLDistinct(args, dialect=self.catalog.warehouse.db.dialect.name)
+        )
+        return query
+
     def as_scalar(self) -> Any:
         with self.as_iterable() as rows:
             row = next(iter(rows))
@@ -1781,10 +1728,13 @@ def _send_result(dataset_query: DatasetQuery) -> None:
 
     columns = preview_args.get("columns") or []
 
-    preview_query = (
-        dataset_query.select(*columns)
-        .limit(preview_args.get("limit", 10))
-        .offset(preview_args.get("offset", 0))
+    if type(dataset_query) is DatasetQuery:
+        preview_query = dataset_query.select(*columns)
+    else:
+        preview_query = dataset_query.select(*columns, _sys=False)
+
+    preview_query = preview_query.limit(preview_args.get("limit", 10)).offset(
+        preview_args.get("offset", 0)
     )
 
     dataset: Optional[tuple[str, int]] = None
@@ -1793,7 +1743,7 @@ def _send_result(dataset_query: DatasetQuery) -> None:
         assert dataset_query.version, "Dataset version should be provided"
         dataset = dataset_query.name, dataset_query.version
 
-    preview = preview_query.to_records()
+    preview = preview_query.to_db_records()
     result = ExecutionResult(preview, dataset, metrics)
     data = attrs.asdict(result)
 
@@ -1853,34 +1803,3 @@ def _random_string(length: int) -> str:
         random.choice(string.ascii_letters + string.digits)  # noqa: S311
         for i in range(length)
     )
-
-
-def _feature_predicate(obj):
-    return (
-        inspect.isclass(obj) and source.isfrommain(obj) and issubclass(obj, BaseModel)
-    )
-
-
-def _imports(obj):
-    return not source.isfrommain(obj)
-
-
-def get_imports(m):
-    root = ast.parse(inspect.getsource(m))
-
-    for node in ast.iter_child_nodes(root):
-        if isinstance(node, ast.Import):
-            module = None
-        elif isinstance(node, ast.ImportFrom):
-            module = node.module
-        else:
-            continue
-
-        for n in node.names:
-            import_script = ""
-            if module:
-                import_script += f"from {module} "
-            import_script += f"import {n.name}"
-            if n.asname:
-                import_script += f" as {n.asname}"
-            yield import_script
datachain/query/dispatch.py CHANGED
@@ -10,7 +10,7 @@ from typing import Any, Optional
 
 import attrs
 import multiprocess
-from dill import load
+from cloudpickle import load, loads
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from multiprocess import get_context
 
@@ -84,7 +84,7 @@ def put_into_queue(queue: Queue, item: Any) -> None:
 
 def udf_entrypoint() -> int:
     # Load UDF info from stdin
-    udf_info = load(stdin.buffer)  # noqa: S301
+    udf_info = load(stdin.buffer)
 
     (
         warehouse_class,
@@ -95,7 +95,7 @@ def udf_entrypoint() -> int:
 
     # Parallel processing (faster for more CPU-heavy UDFs)
     dispatch = UDFDispatcher(
-        udf_info["udf"],
+        udf_info["udf_data"],
         udf_info["catalog_init"],
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
@@ -108,7 +108,7 @@ def udf_entrypoint() -> int:
     batching = udf_info["batching"]
     table = udf_info["table"]
     n_workers = udf_info["processes"]
-    udf = udf_info["udf"]
+    udf = loads(udf_info["udf_data"])
     if n_workers is True:
         # Use default number of CPUs (cores)
         n_workers = None
@@ -146,7 +146,7 @@ class UDFDispatcher:
 
     def __init__(
        self,
-        udf,
+        udf_data,
        catalog_init_params,
        id_generator_clone_params,
        metastore_clone_params,
@@ -155,14 +155,7 @@ class UDFDispatcher:
         is_generator=False,
         buffer_size=DEFAULT_BATCH_SIZE,
     ):
-        # isinstance cannot be used here, as dill packages the entire class definition,
-        # and so these two types are not considered exactly equal,
-        # even if they have the same import path.
-        if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
-            self.udf = udf
-        else:
-            self.udf = None
-            self.udf_factory = udf
+        self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
         (
            self.id_generator_class,
@@ -214,6 +207,15 @@ class UDFDispatcher:
             self.catalog = Catalog(
                 id_generator, metastore, warehouse, **self.catalog_init_params
             )
+        udf = loads(self.udf_data)
+        # isinstance cannot be used here, as cloudpickle packages the entire class
+        # definition, and so these two types are not considered exactly equal,
+        # even if they have the same import path.
+        if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
+            self.udf = udf
+        else:
+            self.udf = None
+            self.udf_factory = udf
         if not self.udf:
             self.udf = self.udf_factory()
 
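Together with the dataset.py changes above, the UDF now crosses the process boundary as cloudpickle bytes: the parent serializes `udf_info` with `filtered_cloudpickle_dumps` and pipes it to the `internal-run-udf` subprocess, which deserializes it from stdin before dispatching work. A rough stand-alone sketch of that round trip (the `double` function and the inline child script are placeholders, not datachain code):

```python
import subprocess
import sys

import cloudpickle


# Parent side: serialize a callable plus its settings into one bytes payload.
def double(x: int) -> int:
    return x * 2


payload = cloudpickle.dumps({"udf": double, "batch": [1, 2, 3]})

# Child side (normally `datachain internal-run-udf` reading stdin); here a tiny
# inline script loads the payload and applies the callable to each item.
child_src = (
    "import sys, cloudpickle\n"
    "info = cloudpickle.load(sys.stdin.buffer)\n"
    "print([info['udf'](x) for x in info['batch']])\n"
)
result = subprocess.run(
    [sys.executable, "-c", child_src], input=payload, capture_output=True, check=True
)
print(result.stdout.decode().strip())  # [2, 4, 6]
```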
datachain/query/schema.py CHANGED
@@ -32,6 +32,7 @@ class Column(sa.ColumnClause, metaclass=ColumnMeta):
     inherit_cache: Optional[bool] = True
 
     def __init__(self, text, type_=None, is_literal=False, _selectable=None):
+        """Dataset column."""
         self.name = ColumnMeta.to_db_name(text)
         super().__init__(
             self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
@@ -41,6 +42,7 @@ class Column(sa.ColumnClause, metaclass=ColumnMeta):
         return Column(self.name + DEFAULT_DELIMITER + name)
 
     def glob(self, glob_str):
+        """Search for matches using glob pattern matching."""
         return self.op("GLOB")(glob_str)
 
 
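Since `glob` just wraps SQLAlchemy's generic `op("GLOB")`, the new docstring documents existing behaviour: the result is an ordinary boolean clause. A small illustrative sketch (the column and dataset names here are invented):

```python
from datachain.query.schema import C

# Keep only JPEG objects by glob-matching the file name; "file.name" is
# flattened to the "file__name" database column by ColumnMeta.to_db_name.
jpeg_only = C("file.name").glob("*.jpg")

# The expression can be passed to a dataset query like any other
# SQLAlchemy clause, e.g. DatasetQuery(name="photos").filter(jpeg_only).
```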
datachain/query/session.py CHANGED
@@ -28,9 +28,9 @@ class Session:
 
     Parameters:
 
-        `name` (str): The name of the session. Only latters and numbers are supported.
+        name (str): The name of the session. Only latters and numbers are supported.
             It can be empty.
-        `catalog` (Catalog): Catalog object.
+        catalog (Catalog): Catalog object.
     """
 
     GLOBAL_SESSION_CTX: Optional["Session"] = None
@@ -80,9 +80,9 @@ class Session:
         """Creates a Session() object from a catalog.
 
         Parameters:
-            `session` (Session): Optional Session(). If not provided a new session will
+            session (Session): Optional Session(). If not provided a new session will
                 be created. It's needed mostly for simplie API purposes.
-            `catalog` (Catalog): Optional catalog. By default a new catalog is created.
+            catalog (Catalog): Optional catalog. By default a new catalog is created.
         """
         if session:
             return session
datachain/sql/functions/array.py CHANGED
@@ -5,6 +5,10 @@ from datachain.sql.utils import compiler_not_implemented
 
 
 class cosine_distance(GenericFunction):  # noqa: N801
+    """
+    Takes a column and array and returns the cosine distance between them.
+    """
+
     type = Float()
     package = "array"
     name = "cosine_distance"
@@ -12,6 +16,10 @@ class cosine_distance(GenericFunction):  # noqa: N801
 
 
 class euclidean_distance(GenericFunction):  # noqa: N801
+    """
+    Takes a column and array and returns the Euclidean distance between them.
+    """
+
     type = Float()
     package = "array"
     name = "euclidean_distance"
@@ -19,6 +27,10 @@ class euclidean_distance(GenericFunction):  # noqa: N801
 
 
 class length(GenericFunction):  # noqa: N801
+    """
+    Returns the length of the array.
+    """
+
     type = Int64()
     package = "array"
     name = "length"
datachain/sql/functions/string.py CHANGED
@@ -5,6 +5,10 @@ from datachain.sql.utils import compiler_not_implemented
 
 
 class length(GenericFunction):  # noqa: N801
+    """
+    Returns the length of the string.
+    """
+
     type = Int64()
     package = "string"
     name = "length"
@@ -12,6 +16,10 @@ class length(GenericFunction):  # noqa: N801
 
 
 class split(GenericFunction):  # noqa: N801
+    """
+    Takes a column and split character and returns an array of the parts.
+    """
+
     type = Array(String())
     package = "string"
     name = "split"
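The `array` and `string` helpers documented above are SQLAlchemy `GenericFunction` subclasses, so instantiating one only builds a SQL expression; the warehouse's dialect decides how it compiles. A sketch of constructing such expressions (the column names and query vector are invented, and nothing is executed here):

```python
from datachain.query.schema import C
from datachain.sql.functions import array, string

# String helpers: length of a path column and splitting it on "/".
path_len = string.length(C("file.path"))
path_parts = string.split(C("file.path"), "/")

# Array helper: cosine distance between an embedding column and a query vector.
dist = array.cosine_distance(C("embedding"), [0.1, 0.2, 0.3])

# These expressions are then used inside dataset queries (mutate/filter/order_by),
# where the active dialect supplies the concrete SQL implementation.
```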
datachain/torch/__init__.py CHANGED
@@ -1,5 +1,5 @@
 try:
-    from datachain.lib.clip import similarity_scores as clip_similarity_scores
+    from datachain.lib.clip import clip_similarity_scores
     from datachain.lib.image import convert_image, convert_images
     from datachain.lib.pytorch import PytorchDataset, label_to_int
     from datachain.lib.text import convert_text
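Only the internal alias changed here: the function in `datachain.lib.clip` is now defined as `clip_similarity_scores`, and the name re-exported from `datachain.torch` stays the same, so existing imports should keep working (assuming the optional torch dependencies are installed):

```python
# Same public import in 0.2.10 and 0.2.12; only the internal alias went away.
from datachain.torch import clip_similarity_scores
```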
datachain/utils.py CHANGED
@@ -1,5 +1,6 @@
 import glob
 import importlib.util
+import io
 import json
 import os
 import os.path as osp
@@ -13,8 +14,10 @@ from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
+import cloudpickle
 from dateutil import tz
 from dateutil.parser import isoparse
+from pydantic import BaseModel
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -388,3 +391,45 @@ def inside_notebook() -> bool:
             return False
 
     return False
+
+
+def get_all_subclasses(cls):
+    """Return all subclasses of a given class.
+    Can return duplicates due to multiple inheritance."""
+    for subclass in cls.__subclasses__():
+        yield from get_all_subclasses(subclass)
+        yield subclass
+
+
+def filtered_cloudpickle_dumps(obj: Any) -> bytes:
+    """Equivalent to cloudpickle.dumps, but this supports Pydantic models."""
+    model_namespaces = {}
+
+    with io.BytesIO() as f:
+        pickler = cloudpickle.CloudPickler(f)
+
+        for model_class in get_all_subclasses(BaseModel):
+            # This "is not None" check is needed, because due to multiple inheritance,
+            # it is theoretically possible to get the same class twice from
+            # get_all_subclasses.
+            if model_class.__pydantic_parent_namespace__ is not None:
+                # __pydantic_parent_namespace__ can contain many unnecessary and
+                # unpickleable entities, so should be removed for serialization.
+                model_namespaces[model_class] = (
+                    model_class.__pydantic_parent_namespace__
+                )
+                model_class.__pydantic_parent_namespace__ = None
+
+        try:
+            pickler.dump(obj)
+            return f.getvalue()
+        finally:
+            for model_class, namespace in model_namespaces.items():
+                # Restore original __pydantic_parent_namespace__ locally.
+                model_class.__pydantic_parent_namespace__ = namespace
+
+
+def get_datachain_executable() -> list[str]:
+    if datachain_exec_path := os.getenv("DATACHAIN_EXEC_PATH"):
+        return [datachain_exec_path]
+    return [sys.executable, "-m", "datachain"]
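The payload produced by `filtered_cloudpickle_dumps` is ordinary cloudpickle output, so the receiving side restores it with a plain `cloudpickle.loads`. A minimal round-trip sketch (the `Point` model is a made-up stand-in for user-defined signal classes, not datachain code):

```python
import cloudpickle
from pydantic import BaseModel

from datachain.utils import filtered_cloudpickle_dumps


class Point(BaseModel):
    # Hypothetical Pydantic model; its __pydantic_parent_namespace__ is
    # temporarily stripped during pickling and restored afterwards.
    x: float
    y: float


data = filtered_cloudpickle_dumps({"cls": Point, "row": Point(x=1.0, y=2.0)})
restored = cloudpickle.loads(data)
print(restored["row"], restored["cls"](x=3.0, y=4.0))
```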