datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of datachain has been flagged as potentially problematic.
- datachain/catalog/catalog.py +57 -212
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/job.py +4 -3
- datachain/lib/arrow.py +21 -5
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +183 -59
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +39 -56
- datachain/lib/signal_schema.py +5 -2
- datachain/node.py +13 -0
- datachain/query/dataset.py +12 -105
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA +7 -3
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD +28 -27
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL +1 -1
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -9,7 +9,6 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -77,7 +76,6 @@ from datachain.utils import (
 )

 from .datasource import DataSource
-from .subclass import SubclassFinder

 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +90,6 @@ logger = logging.getLogger("datachain")

 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]

 TTL_INT = 4 * 60 * 60

@@ -156,8 +153,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]


 class DatasetRowsFetcher(NodesThreadPool):
@@ -571,12 +566,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""


-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -660,33 +649,12 @@ class Catalog:
             ),
         ]
         code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast

-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
+    def compile_query_script(self, script: str) -> str:
         code_ast = ast.parse(script)
         code_ast = self.attach_query_wrapper(code_ast)
-
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
+        return ast.unparse(code_ast)

     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
@@ -1020,20 +988,6 @@ class Catalog:

         return node_groups

-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1251,6 @@ class Catalog:

         return self.get_dataset(name)

-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1363,18 @@ class Catalog:

         return direct_dependencies

-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d

     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))

         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
@@ -1444,7 +1386,8 @@ class Catalog:

         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None)
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
             )

     def ls_dataset_rows(
@@ -1632,15 +1575,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1874,10 +1808,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1905,34 +1835,25 @@ class Catalog:
             C.size > 1000
         )
         """
-
-
-
-
-
-
-
-        try:
-            lines, proc, response_text = self.run_query(
-                python_executable or sys.executable,
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                preview_columns,
-                preview_limit,
-                preview_offset,
-                save,
-                save_as,
-                job_id,
+        if not job_id:
+            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            job_id = self.metastore.create_job(
+                name="",
+                query=query_script,
+                params=params,
+                python_version=python_version,
             )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)

+        lines, proc = self.run_query(
+            python_executable or sys.executable,
+            query_script,
+            envs,
+            capture_output,
+            output_hook,
+            params,
+            save,
+            job_id,
+        )
         output = "".join(lines)

         if proc.returncode:
@@ -1942,105 +1863,69 @@ class Catalog:
                 return_code=proc.returncode,
                 output=output,
             )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
             raise QueryScriptRunError(
                 f"Query script exited with error code {proc.returncode}",
                 return_code=proc.returncode,
                 output=output,
             )

+        def _get_dataset_versions_by_job_id():
+            for dr, dv, job in self.list_datasets_versions():
+                if job and str(job.id) == job_id:
+                    yield dr, dv
+
         try:
-
-
-            response = {}
-            exec_result = ExecutionResult(**response)
-
-            dataset: Optional[DatasetRecord] = None
-            version: Optional[int] = None
-            if save or save_as:
-                dataset, version = self.save_result(
-                    query_script, exec_result, output, version, job_id
+            dr, dv = max(
+                _get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
             )
+        except ValueError as e:
+            if not save:
+                return QueryResult(dataset=None, version=None, output=output)

-
-
-
-
-
-
+            raise QueryScriptDatasetNotFound(
+                "No dataset found after running Query script",
+                output=output,
+            ) from e
+
+        dr = self.update_dataset(
+            dr,
+            script_output=output,
+            query_script=query_script,
         )
+        self.update_dataset_version_with_warehouse_info(
+            dr,
+            dv.version,
+            script_output=output,
+            query_script=query_script,
+            job_id=job_id,
+            is_job_result=True,
+        )
+        return QueryResult(dataset=dr, version=dv.version, output=output)

     def run_query(
         self,
         python_executable: str,
         query_script: str,
         envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
         capture_output: bool,
-        feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
         save: bool,
-        save_as: Optional[str],
         job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen
+    ) -> tuple[list[str], subprocess.Popen]:
         try:
-
-            query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
+            query_script_compiled = self.compile_query_script(query_script)
         except Exception as exc:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
-        else:
-            handle = w
-            kwargs = {"pass_fds": [w]}
         envs = dict(envs or os.environ)
-        if feature_code:
-            envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
-                {feature_module: feature_code}
-            )
         envs.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                 "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
@@ -2051,52 +1936,12 @@ class Catalog:
             stderr=subprocess.STDOUT if capture_output else None,
             bufsize=1,
             text=False,
-            **kwargs,
         ) as proc:
-            os.close(w)
-
             out = proc.stdout
             _lines: list[str] = []
             ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-
-
-            response_text = ""
-            while proc.poll() is None:
-                response_text += f.readline()
-                time.sleep(0.1)
-            response_text += f.readline()
-            return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result.dataset:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            )
-        name, version = exec_result.dataset
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
+            with ctx as lines:
+                return lines, proc

     def cp(
         self,
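Taken together, the catalog.py changes drop the feature-module and preview plumbing: QueryResult now carries only dataset, version, and output; a job record is created up front when no job_id is passed; and the resulting dataset is resolved through that job id. The following is only a usage sketch under those assumptions (the get_catalog entry point and the example script are illustrative, not taken from this diff):

from datachain.catalog import get_catalog  # assumed helper for obtaining a Catalog

# Illustrative query script: it ends by saving a dataset, which the reworked
# Catalog.query() then looks up by the job id it created for this run.
script = """
from datachain.lib.dc import DataChain

DataChain.from_values(num=[1, 2, 3]).save("my-dataset")
"""

catalog = get_catalog()
result = catalog.query(script, save=True)
print(result.dataset.name if result.dataset else None, result.version)
print(result.output)  # combined stdout/stderr captured from the query subprocess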
datachain/cli.py
CHANGED
@@ -14,6 +14,7 @@ import shtab

 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")


-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-
-
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]


 def format_ls_entry(entry: str) -> str:
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records

     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@ def query(
     )

     try:
-
+        catalog.query(
             script_content,
             python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
             capture_output=False,
             params=params,
             job_id=job_id,
@@ -864,10 +840,7 @@
             error_stack=error_stack,
         )
         raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)


 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
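The ls_local branch above now goes through DataChain.listings() instead of the removed ls_indexed_storages() helper. A hedged sketch of the same lookup outside the CLI, assuming (as the new code does) that the collected listing objects expose uri and version attributes:

from datachain.lib.dc import DataChain

# Enumerate indexed storages ("listings") the way the new `ls` code path does.
for listing in DataChain.listings().collect("listing"):
    print(f"{listing.uri}@v{listing.version}")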
datachain/client/fsspec.py
CHANGED
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3

@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

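With the extra branch above, protocol dispatch should now resolve hf:// URLs to the new client. A quick, hypothetical check (the repository path is only an example):

from datachain.client.fsspec import Client

# "hf" is now a recognized protocol; unknown schemes still raise NotImplementedError.
impl = Client.get_implementation("hf://datasets/example-user/example-repo")
print(impl.__name__)  # expected: "HfClient"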
datachain/client/hf.py
ADDED
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)
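A hedged end-to-end sketch of what the new client enables, assuming huggingface_hub is installed, HF_TOKEN is exported for private repos, and the repository name below is a placeholder:

from datachain.lib.dc import DataChain

# HfClient.create_fs() picks up HF_TOKEN from the environment if it is set.
chain = DataChain.from_storage("hf://datasets/example-user/example-repo")
for file in chain.collect("file"):
    print(file.path, file.size)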
datachain/data_storage/metastore.py
CHANGED
@@ -167,21 +167,10 @@ class AbstractMetastore(ABC, Serializable):
         This method should be called when index operation is finished.
         """

-    @abstractmethod
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-
     @abstractmethod
     def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
         """Updates last inserted datetime in bucket with current time."""

-    @abstractmethod
-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-
     @abstractmethod
     def get_storage(self, uri: StorageURI) -> Storage:
         """
@@ -189,10 +178,6 @@ class AbstractMetastore(ABC, Serializable):
         E.g. if s3 is used as storage this would be s3 bucket data.
         """

-    @abstractmethod
-    def list_storages(self) -> list[Storage]:
-        """Returns all storages."""
-
     @abstractmethod
     def mark_storage_pending(self, storage: Storage) -> Storage:
         """Marks storage as pending."""
@@ -324,7 +309,7 @@ class AbstractMetastore(ABC, Serializable):
             self.add_dataset_dependency(
                 source_dataset_name,
                 source_dataset_version,
-                dependency.
+                dependency.dataset_name,
                 int(dependency.version),
             )
         else:
@@ -906,11 +891,6 @@ class AbstractDBMetastore(AbstractMetastore):
             self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
         )

-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-        s = self._storages
-        yield from (r[0] for r in self.db.execute(self._storages_select(s.c.uri)))
-
     def get_storage(self, uri: StorageURI, conn=None) -> Storage:
         """
         Gets storage representation from database.
@@ -926,13 +906,6 @@ class AbstractDBMetastore(AbstractMetastore):

         return self.storage_class._make(result)

-    def list_storages(self) -> list[Storage]:
-        result = self.db.execute(self._storages_select())
-        if not result:
-            return []
-
-        return [self.storage_class._make(r) for r in result]
-
     def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
         # Update status to pending and dates
         updates = {
@@ -1503,7 +1476,7 @@
         return self._jobs.update().where(*where)

     def _parse_job(self, rows) -> Job:
-        return
+        return self.job_class.parse(*rows)

     def _parse_jobs(self, rows) -> Iterator["Job"]:
         for _, g in groupby(rows, lambda r: r[0]):