datachain 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +50 -230
- datachain/error.py +0 -4
- datachain/job.py +4 -3
- datachain/lib/clip.py +1 -1
- datachain/lib/dc.py +92 -38
- datachain/lib/file.py +9 -8
- datachain/lib/image.py +1 -1
- datachain/lib/meta_formats.py +38 -59
- datachain/lib/model_store.py +6 -1
- datachain/lib/text.py +1 -1
- datachain/lib/webdataset.py +13 -0
- datachain/lib/webdataset_laion.py +13 -0
- datachain/query/dataset.py +9 -32
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/METADATA +7 -5
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/RECORD +19 -20
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/LICENSE +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/WHEEL +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -9,11 +9,9 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from contextlib import contextmanager, nullcontext
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
@@ -24,7 +22,6 @@ from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    NamedTuple,
     NoReturn,
     Optional,
     Union,
@@ -59,7 +56,6 @@ from datachain.error import (
     PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptCompileError,
-    QueryScriptDatasetNotFound,
     QueryScriptRunError,
 )
 from datachain.listing import Listing
@@ -77,7 +73,6 @@ from datachain.utils import (
 )
 
 from .datasource import DataSource
-from .subclass import SubclassFinder
 
 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +87,6 @@ logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]
 
 TTL_INT = 4 * 60 * 60
 
@@ -118,44 +112,19 @@ def noop(_: str):
     pass
 
 
-
-
-    stream:
-
-    lines: list[str] = []
-    append = lines.append
+def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+    buffer = b""
+    while byt := stream.read(1):  # Read one byte at a time
+        buffer += byt
 
-
-        buffer = b""
-        while byt := stream.read(1):  # Read one byte at a time
-            buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
-
-            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-                line = buffer.decode("utf-8")
-                print(line, end="")
-                callback(line)
-                append(line)
-                buffer = b""  # Clear buffer for next line
-
-        if buffer:  # Handle any remaining data in the buffer
+        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
             line = buffer.decode("utf-8")
-            print(line, end="")
             callback(line)
-
-
-    thread = Thread(target=loop, daemon=True)
-    thread.start()
-
-    try:
-        yield lines
-    finally:
-        thread.join()
+            buffer = b""  # Clear buffer for next line
 
-
-
-
-    version: Optional[int]
-    output: str
+    if buffer:  # Handle any remaining data in the buffer
+        line = buffer.decode("utf-8")
+        callback(line)
 
 
 class DatasetRowsFetcher(NodesThreadPool):
@@ -569,12 +538,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -658,34 +621,8 @@ class Catalog:
                 ),
             ]
             code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast
 
-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
-        code_ast = ast.parse(script)
-        code_ast = self.attach_query_wrapper(code_ast)
-        finder = SubclassFinder(FEATURE_CLASSES)
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
-
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
         return Client.parse_url(uri, self.cache, **config)
@@ -1416,7 +1353,8 @@ class Catalog:
 
         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None)
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
            )
 
     def ls_dataset_rows(
@@ -1834,14 +1772,15 @@ class Catalog:
     def query(
         self,
         query_script: str,
-
-        python_executable:
+        env: Optional[Mapping[str, str]] = None,
+        python_executable: str = sys.executable,
         save: bool = False,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
-
+        _execute_last_expression: bool = False,
+    ) -> None:
        """
        Method to run custom user Python script to run a query and, as result,
        creates new dataset from the results of a query.
@@ -1864,170 +1803,51 @@ class Catalog:
            C.size > 1000
        )
        """
-
-
-
-
-
-
-
-
-
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                save,
-                job_id,
-            )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)
-
-        output = "".join(lines)
-
-        if proc.returncode:
-            if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
-                raise QueryScriptCancelError(
-                    "Query script was canceled by user",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            raise QueryScriptRunError(
-                f"Query script exited with error code {proc.returncode}",
-                return_code=proc.returncode,
-                output=output,
-            )
-
-        try:
-            result = json.loads(response_text)
-        except ValueError:
-            result = None
-
-        dataset: Optional[DatasetRecord] = None
-        version: Optional[int] = None
-        if save:
-            dataset, version = self.save_result(
-                query_script, result, output, version, job_id
-            )
-
-        return QueryResult(dataset=dataset, version=version, output=output)
-
-    def run_query(
-        self,
-        python_executable: str,
-        query_script: str,
-        envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
-        capture_output: bool,
-        feature_module: str,
-        output_hook: Callable[[str], None],
-        params: Optional[dict[str, str]],
-        save: bool,
-        job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen, str]:
-        try:
-            feature_code, query_script_compiled = self.compile_query_script(
-                query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
-        except Exception as exc:
-            raise QueryScriptCompileError(
-                f"Query script failed to compile, reason: {exc}"
-            ) from exc
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
+        if _execute_last_expression:
+            try:
+                code_ast = ast.parse(query_script)
+                code_ast = self.attach_query_wrapper(code_ast)
+                query_script_compiled = ast.unparse(code_ast)
+            except Exception as exc:
+                raise QueryScriptCompileError(
+                    f"Query script failed to compile, reason: {exc}"
+                ) from exc
         else:
-
-
-
-
-
-                {feature_module: feature_code}
-            )
-            envs.update(
+            query_script_compiled = query_script
+            assert not save
+
+        env = dict(env or os.environ)
+        env.update(
            {
                "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                "PYTHONPATH": os.getcwd(),  # For local imports
                "DATACHAIN_QUERY_SAVE": "1" if save else "",
                "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                "DATACHAIN_JOB_ID": job_id or "",
            },
        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
+        popen_kwargs = {}
+        if capture_output:
+            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        cmd = [python_executable, "-c", query_script_compiled]
+        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # type: ignore[call-overload] # noqa: S603
+            if capture_output:
+                args = (proc.stdout, output_hook)
+                thread = Thread(target=_process_stream, args=args, daemon=True)
+                thread.start()
+                thread.join()  # wait for the reader thread
+
+        if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
+            raise QueryScriptCancelError(
+                "Query script was canceled by user",
+                return_code=proc.returncode,
+            )
+        if proc.returncode:
+            raise QueryScriptRunError(
+                f"Query script exited with error code {proc.returncode}",
+                return_code=proc.returncode,
            )
-        name, version = exec_result
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
 
     def cp(
         self,
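Note (editorial example, not part of the diff): with these changes `Catalog.query()` compiles the user script itself (optionally wrapping the last expression), builds the child-process environment from `env` (or `os.environ` when omitted), and streams output through `_process_stream` into `output_hook`; it now returns `None` instead of a `QueryResult`. A minimal sketch of a call against the new signature — the catalog setup and script text are hypothetical:

    from datachain.catalog import get_catalog

    catalog = get_catalog()
    script = "from datachain.lib.dc import DataChain\n"  # any user query script

    catalog.query(
        script,
        env={"MY_FLAG": "1"},   # used as the base environment; os.environ is used when omitted
        capture_output=True,
        output_hook=print,      # receives decoded subprocess output line by line
    )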
datachain/error.py
CHANGED
datachain/job.py
CHANGED
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
            status,
            created_at,
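Note (editorial example, not part of the diff): the practical effect is that `Job.parse()` now tolerates UUID objects coming from the metastore and always stores a string id:

    import uuid

    raw_id = uuid.uuid4()     # what a metastore backend may hand back
    stored_id = str(raw_id)   # the coercion Job.parse() now applies to Job.id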
datachain/lib/clip.py
CHANGED
@@ -18,7 +18,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
         hasattr(model, method_name) and inspect.ismethod(getattr(model, method_name))
     ):
         method = getattr(model, method_name)
-        return lambda x: method(torch.
+        return lambda x: method(torch.as_tensor(x).clone().detach())
 
     # Check for model from clip or open_clip library
     method_name = f"encode_{type}"
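Note (editorial example, not part of the diff): the same conversion fix appears in `datachain/lib/text.py` and `datachain/lib/image.py` below. Inputs are now converted with `torch.as_tensor(...).clone().detach()`, presumably replacing a plain `torch.tensor(...)` call (the removed line is truncated in this view); `as_tensor` accepts lists, arrays, and existing tensors, and the `clone().detach()` form avoids PyTorch's copy-construction warning for tensor inputs. A small illustration of the pattern, with made-up data:

    import torch

    data = [[0.1, 0.2], [0.3, 0.4]]
    x = torch.as_tensor(data).clone().detach()  # works for lists and arrays
    y = torch.as_tensor(x).clone().detach()     # no UserWarning for an existing tensor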
datachain/lib/dc.py
CHANGED
@@ -56,7 +56,7 @@ from datachain.query.dataset import (
     PartitionByType,
     detach,
 )
-from datachain.query.schema import Column, DatasetRow
+from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook
 
@@ -112,11 +112,31 @@ class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
+def _get_merge_error_str(col: Union[str, sqlalchemy.ColumnElement]) -> str:
+    if isinstance(col, str):
+        return col
+    if isinstance(col, sqlalchemy.Column):
+        return col.name.replace(DEFAULT_DELIMITER, ".")
+    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+        return f"{col.name} expression"
+    return str(col)
+
+
 class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(
-
+    def __init__(  # noqa: D107
+        self,
+        on: Sequence[Union[str, sqlalchemy.ColumnElement]],
+        right_on: Optional[Sequence[Union[str, sqlalchemy.ColumnElement]]],
+        msg: str,
+    ):
+        def _get_str(on: Sequence[Union[str, sqlalchemy.ColumnElement]]) -> str:
+            if not isinstance(on, Sequence):
+                return str(on)  # type: ignore[unreachable]
+            return ", ".join([_get_merge_error_str(col) for col in on])
+
+        on_str = _get_str(on)
         right_on_str = (
-            ", right_on='" +
+            ", right_on='" + _get_str(right_on) + "'"
             if right_on and isinstance(right_on, Sequence)
             else ""
         )
@@ -139,7 +159,7 @@ class Sys(DataModel):
 
 
 class DataChain(DatasetQuery):
-    """
+    """DataChain - a data structure for batch data processing and evaluation.
 
     It represents a sequence of data manipulation steps such as reading data from
     storages, running AI or LLM models or calling external services API to validate or
@@ -252,13 +272,24 @@ class DataChain(DatasetQuery):
         """Returns Column instance with a type if name is found in current schema,
         otherwise raises an exception.
         """
-
+        if "." in name:
+            name_path = name.split(".")
+        elif DEFAULT_DELIMITER in name:
+            name_path = name.split(DEFAULT_DELIMITER)
+        else:
+            name_path = [name]
         for path, type_, _, _ in self.signals_schema.get_flat_tree():
             if path == name_path:
                 return Column(name, python_to_sql(type_))
 
         raise ValueError(f"Column with name {name} not found in the schema")
 
+    def c(self, column: Union[str, Column]) -> Column:
+        """Returns Column instance attached to the current chain."""
+        c = self.column(column) if isinstance(column, str) else self.column(column.name)
+        c.table = self.table
+        return c
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()
@@ -384,7 +415,7 @@ class DataChain(DatasetQuery):
             .save(list_dataset_name, listing=True)
         )
 
-        dc = cls.from_dataset(list_dataset_name, session=session)
+        dc = cls.from_dataset(list_dataset_name, session=session, settings=settings)
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
         return ls(dc, list_path, recursive=recursive, object_name=object_name)
@@ -395,6 +426,7 @@ class DataChain(DatasetQuery):
         name: str,
         version: Optional[int] = None,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
     ) -> "DataChain":
         """Get data from a saved Dataset. It returns the chain itself.
 
@@ -407,7 +439,7 @@ class DataChain(DatasetQuery):
             chain = DataChain.from_dataset("my_cats")
             ```
         """
-        return DataChain(name=name, version=version, session=session)
+        return DataChain(name=name, version=version, session=session, settings=settings)
 
     @classmethod
     def from_json(
@@ -1140,8 +1172,17 @@ class DataChain(DatasetQuery):
     def merge(
         self,
         right_ds: "DataChain",
-        on: Union[
-
+        on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+        ],
+        right_on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+            None,
+        ] = None,
         inner=False,
         rname="right_",
     ) -> "Self":
@@ -1166,7 +1207,7 @@ class DataChain(DatasetQuery):
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
 
-        if isinstance(on, str):
+        if isinstance(on, (str, sqlalchemy.ColumnElement)):
             on = [on]
         elif not isinstance(on, Sequence):
             raise DatasetMergeError(
@@ -1175,19 +1216,15 @@ class DataChain(DatasetQuery):
                 f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
             )
 
-        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-
-        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
-            if isinstance(right_on, str):
+            if isinstance(right_on, (str, sqlalchemy.ColumnElement)):
                 right_on = [right_on]
             elif not isinstance(right_on, Sequence):
                 raise DatasetMergeError(
                     on,
                     right_on,
                     "'right_on' must be 'str' or 'Sequence' object"
-                    f" but got type '{right_on}'",
+                    f" but got type '{type(right_on)}'",
                 )
 
             if len(right_on) != len(on):
@@ -1195,34 +1232,39 @@ class DataChain(DatasetQuery):
                     on, right_on, "'on' and 'right_on' must have the same length'"
                 )
 
-            right_on_columns: list[str] = right_signals_schema.resolve(
-                *right_on
-            ).db_signals()  # type: ignore[assignment]
-
-            if len(right_on_columns) != len(on_columns):
-                on_str = ", ".join(right_on_columns)
-                right_on_str = ", ".join(right_on_columns)
-                raise DatasetMergeError(
-                    on,
-                    right_on,
-                    "'on' and 'right_on' must have the same number of columns in db'."
-                    f" on -> {on_str}, right_on -> {right_on_str}",
-                )
-        else:
-            right_on = on
-            right_on_columns = on_columns
-
         if self == right_ds:
             right_ds = right_ds.clone(new_table=True)
 
+        errors = []
+
+        def _resolve(
+            ds: DataChain,
+            col: Union[str, sqlalchemy.ColumnElement],
+            side: Union[str, None],
+        ):
+            try:
+                return ds.c(col) if isinstance(col, (str, C)) else col
+            except ValueError:
+                if side:
+                    errors.append(f"{_get_merge_error_str(col)} in {side}")
+
         ops = [
-            self
-
+            _resolve(self, left, "left")
+            == _resolve(right_ds, right, "right" if right_on else None)
+            for left, right in zip(on, right_on or on)
         ]
 
+        if errors:
+            raise DatasetMergeError(
+                on, right_on, f"Could not resolve {', '.join(errors)}"
+            )
+
         ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")
 
         ds.feature_schema = None
+
+        signals_schema = self.signals_schema.clone_without_sys_signals()
+        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
             right_signals_schema, rname
         )
@@ -1581,6 +1623,8 @@ class DataChain(DatasetQuery):
         model_name: str = "",
         source: bool = True,
         nrows=None,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1597,6 +1641,8 @@ class DataChain(DatasetQuery):
             model_name : Generated model name.
             source : Whether to include info about the source file.
             nrows : Optional row limit.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.
 
         Example:
             Reading a csv file:
@@ -1613,7 +1659,9 @@ class DataChain(DatasetQuery):
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
 
-        chain = DataChain.from_storage(
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )
 
         column_names = None
         if not header:
@@ -1660,6 +1708,8 @@ class DataChain(DatasetQuery):
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1672,6 +1722,8 @@ class DataChain(DatasetQuery):
         object_name : Created object column name.
         model_name : Generated model name.
         source : Whether to include info about the source file.
+        session : Session to use for the chain.
+        settings : Settings to use for the chain.
 
         Example:
             Reading a single file:
@@ -1684,7 +1736,9 @@ class DataChain(DatasetQuery):
             dc = DataChain.from_parquet("s3://mybucket/dir")
             ```
         """
-        chain = DataChain.from_storage(
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )
         return chain.parse_tabular(
             output=output,
            object_name=object_name,
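Note (editorial example, not part of the diff): taken together, the dc.py changes let merge keys be given as column expressions and let `from_dataset`, `from_csv`, and `from_parquet` carry an explicit session/settings. A brief sketch of the new surface — dataset, column, and settings names are hypothetical:

    from datachain.lib.dc import DataChain

    orders = DataChain.from_dataset("orders")
    customers = DataChain.from_dataset("customers", settings={"cache": True})

    # c() returns a Column bound to the chain's table, so it can be used
    # directly as a merge key alongside plain string names:
    merged = orders.merge(
        customers,
        on=orders.c("customer.id"),
        right_on=customers.c("id"),
    )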
datachain/lib/file.py
CHANGED
@@ -195,14 +195,15 @@ class File(DataModel):
             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
 
-
-
-
-
-
-
-
-
+        else:
+            uid = self.get_uid()
+            client = self._catalog.get_client(self.source)
+            if self._caching_enabled:
+                client.download(uid, callback=self._download_cb)
+            with client.open_object(
+                uid, use_cache=self._caching_enabled, cb=self._download_cb
+            ) as f:
+                yield io.TextIOWrapper(f) if mode == "r" else f
 
     def read(self, length: int = -1):
         """Returns file contents."""
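Note (editorial example, not part of the diff): the restored `else` branch means `File.open()` streams the object through the storage client, pre-downloading it only when caching is enabled, and wraps the stream in `io.TextIOWrapper` when `mode == "r"`. A minimal usage sketch under that assumption, with a hypothetical helper:

    def line_count(file) -> int:
        # text mode ("r") yields an io.TextIOWrapper; binary mode yields the raw stream
        with file.open(mode="r") as f:
            return sum(1 for _ in f)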
datachain/lib/image.py
CHANGED
@@ -34,7 +34,7 @@ def convert_image(
         from transformers.image_processing_utils import BaseImageProcessor
 
         if isinstance(transform, BaseImageProcessor):
-            img = torch.
+            img = torch.as_tensor(img.pixel_values[0]).clone().detach()  # type: ignore[assignment,attr-defined]
     except ImportError:
         pass
     if device:
datachain/lib/meta_formats.py
CHANGED
@@ -1,15 +1,12 @@
-# pip install datamodel-code-generator
-# pip install jmespath
-#
 import csv
-import io
 import json
-import
-import sys
+import tempfile
 import uuid
 from collections.abc import Iterator
-from
+from pathlib import Path
+from typing import Callable
 
+import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
@@ -47,9 +44,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
-
-
-        )  # comply with Python class names
+        # comply with Python class names
+        uid_str = str(generate_uuid()).replace("-", "")
         model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
@@ -70,33 +66,26 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     if data_type == "jsonl":
         data_type = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
-
-
-
-
-    "
-
-
-
-
-
-
-
-
-
-        capture_output=True,
-        check=True,
+
+    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+    input_file_type = input_file_types[data_type]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output = Path(tmpdir) / "model.py"
+        datamodel_code_generator.generate(
+            data_string,
+            input_file_type=input_file_type,
+            output=output,
+            target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+            base_class="datachain.lib.meta_formats.UserModel",
+            class_name=model_name,
+            additional_imports=["datachain.lib.data_model.DataModel"],
+            use_standard_collections=True,
         )
-
-
-
-
-
-    print(f"{model_output}")
-    print("from datachain.lib.data_model import DataModel")
-    print("\n" + f"DataModel.register({model_name})" + "\n")
-    print("\n" + f"spec={model_name}" + "\n")
-    return model_output
+        epilogue = f"""
+DataModel.register({model_name})
+spec = {model_name}
+"""
+        return output.read_text() + epilogue
 
 
 #
@@ -113,35 +102,25 @@ def read_meta(  # noqa: C901
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
-    # ugly hack: datachain is run redirecting printed outputs to a variable
     if schema_from:
-
-
-
-
-
-
-
-
-            meta_schema=lambda file: read_schema(
-                file, data_type=meta_type, expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
+        chain = (
+            DataChain.from_storage(schema_from, type="text")
+            .limit(1)
+            .map(  # dummy column created (#1615)
+                meta_schema=lambda file: read_schema(
+                    file, data_type=meta_type, expr=jmespath, model_name=model_name
+                ),
+                output=str,
            )
-
-
-        sys.stdout = current_stdout
-        model_output = captured_output.getvalue()
-        captured_output.close()
-
+        )
+        (model_output,) = chain.collect("meta_schema")
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
-
-        exec(model_output,
-        spec =
+        gl = globals()
+        exec(model_output, gl)  # type: ignore[arg-type] # noqa: S102
+        spec = gl["spec"]
 
     if not (spec) and not (schema_from):
         raise ValueError(
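Note (editorial example, not part of the diff): the new `read_schema()` path is essentially a direct, in-process call to datamodel-code-generator instead of shelling out and capturing stdout. A standalone sketch of the same call pattern, with a made-up sample record:

    import tempfile
    from pathlib import Path

    import datamodel_code_generator

    data = '{"name": "cat", "size": 5}'
    with tempfile.TemporaryDirectory() as tmpdir:
        out = Path(tmpdir) / "model.py"
        datamodel_code_generator.generate(
            data,
            input_file_type=datamodel_code_generator.InputFileType.Json,
            output=out,
            class_name="SampleModel",
        )
        print(out.read_text())  # generated Pydantic model source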
datachain/lib/model_store.py
CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import ClassVar, Optional
 
@@ -69,7 +70,11 @@ class ModelStore:
 
     @staticmethod
     def is_pydantic(val):
-        return
+        return (
+            not hasattr(val, "__origin__")
+            and inspect.isclass(val)
+            and issubclass(val, BaseModel)
+        )
 
     @staticmethod
     def to_pydantic(val) -> Optional[type[BaseModel]]:
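Note (editorial example, not part of the diff): with this check, `ModelStore.is_pydantic()` only accepts real classes and rejects generic aliases. A short sketch with a made-up model:

    from pydantic import BaseModel

    from datachain.lib.model_store import ModelStore

    class Pet(BaseModel):
        name: str

    ModelStore.is_pydantic(Pet)        # True: a plain BaseModel subclass
    ModelStore.is_pydantic(list[Pet])  # False: generic alias has __origin__
    ModelStore.is_pydantic("pet")      # False: not a class at all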
datachain/lib/text.py
CHANGED
@@ -33,7 +33,7 @@ def convert_text(
     res = tokenizer(text)
 
     tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-    tokens = torch.
+    tokens = torch.as_tensor(tokens).clone().detach()
     if device:
         tokens = tokens.to(device)
 
datachain/lib/webdataset.py
CHANGED
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import tarfile
+import warnings
 from collections.abc import Iterator, Sequence
 from pathlib import Path
 from typing import (
@@ -19,6 +20,18 @@ from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError
 
+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSAllFile" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+
 
 class WDSError(DataChainError):
     def __init__(self, tar_stream, message: str):
datachain/lib/webdataset_laion.py
CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Iterator
 from typing import Optional
 
@@ -7,6 +8,18 @@ from pydantic import BaseModel, Field
 from datachain.lib.file import File
 from datachain.lib.webdataset import WDSBasic, WDSReadableSubclass
 
+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSLaion" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+
 
 class Laion(WDSReadableSubclass):
     uid: str = Field(default="")
datachain/query/dataset.py
CHANGED
@@ -1,6 +1,5 @@
 import contextlib
 import inspect
-import json
 import logging
 import os
 import random
@@ -37,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm import tqdm
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import (
-    QUERY_SCRIPT_CANCELED_EXIT_CODE,
-    QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-    get_catalog,
-)
+from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -1173,8 +1168,12 @@ class DatasetQuery:
         """
         return self.name is not None and self.version is not None
 
-    def c(self,
-        col
+    def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+        col: sqlalchemy.ColumnClause = (
+            sqlalchemy.column(column)
+            if isinstance(column, str)
+            else sqlalchemy.column(column.name, column.type)
+        )
         col.table = self.table
         return col
 
@@ -1710,27 +1709,14 @@ class DatasetQuery:
         return self.__class__(name=name, version=version, catalog=self.catalog)
 
 
-def
-    handle = os.getenv("DATACHAIN_OUTPUT_FD")
-    if not handle:
-        return os.devnull
-
-    if os.name != "nt":
-        return int(handle)
-
-    import msvcrt
-
-    return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
-def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+def query_wrapper(dataset_query: Any) -> Any:
     """
     Wrapper function that wraps the last statement of user query script.
     Last statement MUST be instance of DatasetQuery, otherwise script exits with
     error code 10
     """
     if not isinstance(dataset_query, DatasetQuery):
-
+        return dataset_query
 
     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
@@ -1742,13 +1728,4 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(dataset, f)
     return dataset_query
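Note (editorial example, not part of the diff): the typed `DatasetQuery.c()` mirrors the new `DataChain.c()` above — a plain string becomes an untyped `sqlalchemy.column()`, a `C` instance keeps its type, and either way the result is bound to the query's table. A short sketch with a hypothetical dataset name:

    from datachain.query import DatasetQuery

    dq = DatasetQuery(name="my_dataset")
    size = dq.c("size")            # ColumnClause attached to dq.table
    big = dq.filter(size > 1000)   # usable directly in SQL expressions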
{datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.10
+Version: 0.3.12
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -80,7 +80,6 @@ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
 Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
 Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
 Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
-Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
 Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
 Requires-Dist: virtualenv ; extra == 'tests'
 Requires-Dist: dulwich ; extra == 'tests'
@@ -96,8 +95,14 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'
 
+================
+|logo| DataChain
+================
+
 |PyPI| |Python Version| |Codecov| |Tests|
 
+.. |logo| image:: docs/assets/datachain.svg
+   :height: 24
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
    :target: https://pypi.org/project/datachain/
    :alt: PyPI
@@ -111,9 +116,6 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
 
-AI 🔗 DataChain
-----------------
-
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
 your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
{datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/RECORD
CHANGED
@@ -6,8 +6,8 @@ datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
-datachain/error.py,sha256=
-datachain/job.py,sha256=
+datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
+datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
 datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
@@ -17,10 +17,9 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=xVFNUZ339u2l58ZyPaiJ6GsRRpwqq0LYUbdOHC-Otog,69654
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
-datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -40,27 +39,27 @@ datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9m
 datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
-datachain/lib/clip.py,sha256=
+datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dc.py,sha256=gYRkrriG5RJxgLpOUccDU8DFRSoeWZjgmJwHfUo_z7w,68731
+datachain/lib/file.py,sha256=tNb3rJyRYGxpOc6XxcZjIQ9yVHKc7WLAOKoTYqp2TB0,11475
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
-datachain/lib/image.py,sha256=
+datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
-datachain/lib/meta_formats.py,sha256=
-datachain/lib/model_store.py,sha256=
+datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
+datachain/lib/model_store.py,sha256=xcrQ69-jcQs716U4UFOSoSKM7EvFIWqxlPhIcE4X7oI,2497
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
-datachain/lib/text.py,sha256=
+datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=
-datachain/lib/webdataset_laion.py,sha256=
+datachain/lib/webdataset.py,sha256=ZzGLtOUA-QjP4kttGgNqhrioDuDnomWFlsow4fLdezQ,8717
+datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
 datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
@@ -70,7 +69,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.12.dist-info/METADATA,sha256=I_Yz0lbiCk4KWv026U7zpDGrU72G575Hd_OnE_seb1k,17073
+datachain-0.3.12.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.12.dist-info/RECORD,,
datachain/catalog/subclass.py
DELETED
@@ -1,60 +0,0 @@
-import ast
-
-
-class SubclassFinder(ast.NodeVisitor):
-    """Finds subclasses of a target class in an AST."""
-
-    def __init__(self, target_classes: list[str]):
-        self.imports: list[ast.AST] = []
-        self.main_body: list[ast.AST] = []
-
-        self.target_classes: list[str] = target_classes
-        self.aliases: dict[str, str] = {}
-        self.feature_class: list[ast.AST] = []
-
-    def visit_ImportFrom(self, node):  # noqa: N802
-        module = node.module
-        for alias in node.names:
-            full_name = f"{module}.{alias.name}"
-            self.aliases[alias.asname or alias.name] = full_name
-        self.imports.append(node)
-
-    def visit_Import(self, node):  # noqa: N802
-        for alias in node.names:
-            self.aliases[alias.asname or alias.name] = alias.name
-        self.imports.append(node)
-
-    def visit_ClassDef(self, node):  # noqa: N802
-        base_names = [self.get_base_name(base) for base in node.bases]
-        if any(self.is_subclass(name) for name in base_names):
-            self.feature_class.append(node)
-        else:
-            self.main_body.append(node)
-
-    def visit(self, node):
-        if isinstance(
-            node,
-            (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-        ):
-            return super().visit(node)
-        self.main_body.append(node)
-        return node
-
-    def get_base_name(self, node):
-        if isinstance(node, ast.Name):
-            return self.aliases.get(node.id, node.id)
-        if isinstance(node, ast.Attribute):
-            return self.get_full_attr_name(node)
-        if isinstance(node, ast.Subscript):
-            return self.get_base_name(node.value)
-        return None
-
-    def get_full_attr_name(self, node):
-        if isinstance(node.value, ast.Name):
-            return f"{node.value.id}.{node.attr}"
-        if isinstance(node.value, ast.Attribute):
-            return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-        return node.attr
-
-    def is_subclass(self, base_name):
-        return base_name and base_name.split(".")[-1] in self.target_classes