datachain 0.1.12__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (44)
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.1.12'
- __version_tuple__ = version_tuple = (0, 1, 12)
+ __version__ = version = '0.2.0'
+ __version_tuple__ = version_tuple = (0, 2, 0)
datachain/asyn.py CHANGED
@@ -82,13 +82,13 @@ class AsyncMapper(Generic[InputT, ResultT]):
          for _i in range(self.workers):
              self.start_task(self.worker())
          try:
-             done, pending = await asyncio.wait(
+             done, _pending = await asyncio.wait(
                  self._tasks, return_when=asyncio.FIRST_COMPLETED
              )
              self.gather_exceptions(done)
              assert producer.done()
              join = self.start_task(self.work_queue.join())
-             done, pending = await asyncio.wait(
+             done, _pending = await asyncio.wait(
                  self._tasks, return_when=asyncio.FIRST_COMPLETED
              )
              self.gather_exceptions(done)
@@ -208,7 +208,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
 
      async def _pop_result(self) -> Optional[ResultT]:
          if self.heap and self.heap[0][0] == self._next_yield:
-             i, out = heappop(self.heap)
+             _i, out = heappop(self.heap)
          else:
              self._getters[self._next_yield] = get_value = self.loop.create_future()
              out = await get_value
datachain/catalog/__init__.py CHANGED
@@ -8,10 +8,10 @@ from .catalog import (
  from .loader import get_catalog
 
  __all__ = [
+     "QUERY_DATASET_PREFIX",
+     "QUERY_SCRIPT_CANCELED_EXIT_CODE",
+     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
      "Catalog",
      "get_catalog",
      "parse_edatachain_file",
-     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
-     "QUERY_SCRIPT_CANCELED_EXIT_CODE",
-     "QUERY_DATASET_PREFIX",
  ]
datachain/catalog/catalog.py CHANGED
@@ -142,6 +142,7 @@ class QueryResult(NamedTuple):
      version: Optional[int]
      output: str
      preview: Optional[list[dict]]
+     metrics: dict[str, Any]
 
 
  class DatasetRowsFetcher(NodesThreadPool):
@@ -876,13 +877,11 @@ class Catalog:
              # so this is to improve performance
              return None
 
-         dsrc_all = []
+         dsrc_all: list[DataSource] = []
          for listing, file_path in enlisted_sources:
              nodes = listing.expand_path(file_path)
              dir_only = file_path.endswith("/")
-             for node in nodes:
-                 dsrc_all.append(DataSource(listing, node, dir_only))
-
+             dsrc_all.extend(DataSource(listing, node, dir_only) for node in nodes)
          return dsrc_all
 
      def enlist_sources_grouped(
@@ -1997,6 +1996,7 @@ class Catalog:
              version=version,
              output=output,
              preview=exec_result.preview,
+             metrics=exec_result.metrics,
          )
 
      def run_query(
@@ -2068,8 +2068,8 @@ class Catalog:
                  "DATACHAIN_JOB_ID": job_id or "",
              },
          )
-         with subprocess.Popen(
-             [python_executable, "-c", query_script_compiled],  # noqa: S603
+         with subprocess.Popen(  # noqa: S603
+             [python_executable, "-c", query_script_compiled],
              env=envs,
              stdout=subprocess.PIPE if capture_output else None,
              stderr=subprocess.STDOUT if capture_output else None,
datachain/catalog/loader.py CHANGED
@@ -35,7 +35,7 @@ def get_id_generator() -> "AbstractIDGenerator":
      id_generator_obj = deserialize(id_generator_serialized)
      if not isinstance(id_generator_obj, AbstractIDGenerator):
          raise RuntimeError(
-             f"Deserialized ID generator is not an instance of AbstractIDGenerator: "
+             "Deserialized ID generator is not an instance of AbstractIDGenerator: "
              f"{id_generator_obj}"
          )
      return id_generator_obj
@@ -67,7 +67,7 @@ def get_metastore(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractMet
      metastore_obj = deserialize(metastore_serialized)
      if not isinstance(metastore_obj, AbstractMetastore):
          raise RuntimeError(
-             f"Deserialized Metastore is not an instance of AbstractMetastore: "
+             "Deserialized Metastore is not an instance of AbstractMetastore: "
              f"{metastore_obj}"
          )
      return metastore_obj
@@ -101,7 +101,7 @@ def get_warehouse(id_generator: Optional["AbstractIDGenerator"]) -> "AbstractWar
      warehouse_obj = deserialize(warehouse_serialized)
      if not isinstance(warehouse_obj, AbstractWarehouse):
          raise RuntimeError(
-             f"Deserialized Warehouse is not an instance of AbstractWarehouse: "
+             "Deserialized Warehouse is not an instance of AbstractWarehouse: "
              f"{warehouse_obj}"
          )
      return warehouse_obj
datachain/cli.py CHANGED
@@ -845,6 +845,7 @@ def query(
          query=script_content,
          query_type=JobQueryType.PYTHON,
          python_version=python_version,
+         params=params,
      )
 
      try:
@@ -870,7 +871,7 @@ def query(
          )
          raise
 
-     catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
+     catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
 
      show_records(result.preview, collapse_columns=not no_collapse)
datachain/client/azure.py CHANGED
@@ -1,10 +1,12 @@
+ import posixpath
  from typing import Any
 
  from adlfs import AzureBlobFileSystem
+ from tqdm import tqdm
 
  from datachain.node import Entry
 
- from .fsspec import DELIMITER, Client
+ from .fsspec import DELIMITER, Client, ResultQueue
 
 
  class AzureClient(Client):
@@ -28,3 +30,37 @@ class AzureClient(Client):
              last_modified=v["last_modified"],
              size=v.get("size", ""),
          )
+
+     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
+         prefix = start_prefix
+         if prefix:
+             prefix = prefix.lstrip(DELIMITER) + DELIMITER
+         found = False
+         try:
+             with tqdm(desc=f"Listing {self.uri}", unit=" objects") as pbar:
+                 async with self.fs.service_client.get_container_client(
+                     container=self.name
+                 ) as container_client:
+                     async for page in container_client.list_blobs(
+                         include=["metadata", "versions"], name_starts_with=prefix
+                     ).by_page():
+                         entries = []
+                         async for b in page:
+                             found = True
+                             if not self._is_valid_key(b["name"]):
+                                 continue
+                             info = (await self.fs._details([b]))[0]
+                             full_path = info["name"]
+                             parent = posixpath.dirname(self.rel_path(full_path))
+                             entries.append(self.convert_info(info, parent))
+                         if entries:
+                             await result_queue.put(entries)
+                             pbar.update(len(entries))
+             if not found:
+                 raise FileNotFoundError(
+                     f"Unable to resolve remote path: {prefix}"
+                 )
+         finally:
+             result_queue.put_nowait(None)
+
+     _fetch_default = _fetch_flat
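
The new listing method can, in principle, be driven on its own. Below is a minimal sketch, assuming an already-constructed AzureClient instance (here called client) and relying only on the queue protocol visible in the diff above: _fetch_flat enqueues lists of entries and always finishes with a None sentinel. The helper name drain_flat_listing is hypothetical.

import asyncio

async def drain_flat_listing(client, prefix: str = "") -> list:
    """Collect every entry batch that _fetch_flat pushes onto its result queue."""
    queue: asyncio.Queue = asyncio.Queue()
    task = asyncio.create_task(client._fetch_flat(prefix, queue))
    entries = []
    # Batches arrive as lists; a final None marks the end of the listing.
    while (batch := await queue.get()) is not None:
        entries.extend(batch)
    await task  # re-raises FileNotFoundError etc. if the listing failed
    return entries

# e.g. entries = asyncio.run(drain_flat_listing(client))
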
datachain/client/fsspec.py CHANGED
@@ -202,7 +202,7 @@ class Client(ABC):
          try:
              impl = getattr(self, f"_fetch_{method}")
          except AttributeError:
-             raise ValueError("Unknown indexing method '{method}'") from None
+             raise ValueError(f"Unknown indexing method '{method}'") from None
          result_queue: ResultQueue = asyncio.Queue()
          loop = get_loop()
          main_task = loop.create_task(impl(start_prefix, result_queue))
datachain/client/local.py CHANGED
@@ -135,7 +135,7 @@ class FileClient(Client):
          return posixpath.relpath(path, self.name)
 
      def get_full_path(self, rel_path):
-         full_path = Path(self.name, rel_path).as_uri()
+         full_path = Path(self.name, rel_path).as_posix()
          if rel_path.endswith("/") or not rel_path:
              full_path += "/"
          return full_path
datachain/data_storage/__init__.py CHANGED
@@ -5,8 +5,8 @@ from .warehouse import AbstractWarehouse
 
  __all__ = [
      "AbstractDBIDGenerator",
-     "AbstractIDGenerator",
      "AbstractDBMetastore",
+     "AbstractIDGenerator",
      "AbstractMetastore",
      "AbstractWarehouse",
      "JobQueryType",
datachain/data_storage/metastore.py CHANGED
@@ -385,6 +385,7 @@ class AbstractMetastore(ABC, Serializable):
          query_type: JobQueryType = JobQueryType.PYTHON,
          workers: int = 1,
          python_version: Optional[str] = None,
+         params: Optional[dict[str, str]] = None,
      ) -> str:
          """
          Creates a new job.
@@ -398,6 +399,7 @@ class AbstractMetastore(ABC, Serializable):
          status: JobStatus,
          error_message: Optional[str] = None,
          error_stack: Optional[str] = None,
+         metrics: Optional[dict[str, Any]] = None,
      ) -> None:
          """Set the status of the given job."""
 
@@ -1165,9 +1167,7 @@ class AbstractDBMetastore(AbstractMetastore):
          return dataset_version
 
      def _parse_dataset(self, rows) -> Optional[DatasetRecord]:
-         versions = []
-         for r in rows:
-             versions.append(self.dataset_class.parse(*r))
+         versions = [self.dataset_class.parse(*r) for r in rows]
          if not versions:
              return None
          return reduce(lambda ds, version: ds.merge_versions(version), versions)
@@ -1463,6 +1463,8 @@ class AbstractDBMetastore(AbstractMetastore):
              Column("python_version", Text, nullable=True),
              Column("error_message", Text, nullable=False, default=""),
              Column("error_stack", Text, nullable=False, default=""),
+             Column("params", JSON, nullable=False),
+             Column("metrics", JSON, nullable=False),
          ]
 
      @cached_property
@@ -1489,6 +1491,7 @@ class AbstractDBMetastore(AbstractMetastore):
          query_type: JobQueryType = JobQueryType.PYTHON,
          workers: int = 1,
          python_version: Optional[str] = None,
+         params: Optional[dict[str, str]] = None,
          conn: Optional[Any] = None,
      ) -> str:
          """
@@ -1508,6 +1511,8 @@ class AbstractDBMetastore(AbstractMetastore):
              python_version=python_version,
              error_message="",
              error_stack="",
+             params=json.dumps(params or {}),
+             metrics=json.dumps({}),
          ),
          conn=conn,
      )
@@ -1519,6 +1524,7 @@ class AbstractDBMetastore(AbstractMetastore):
          status: JobStatus,
          error_message: Optional[str] = None,
          error_stack: Optional[str] = None,
+         metrics: Optional[dict[str, Any]] = None,
          conn: Optional[Any] = None,
      ) -> None:
          """Set the status of the given job."""
@@ -1529,6 +1535,8 @@ class AbstractDBMetastore(AbstractMetastore):
              values["error_message"] = error_message
          if error_stack:
              values["error_stack"] = error_stack
+         if metrics:
+             values["metrics"] = json.dumps(metrics)
          self.db.execute(
              self._jobs_update(self._jobs.c.id == job_id).values(**values),
              conn=conn,
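
The new params and metrics columns are stored as JSON strings (json.dumps above). A small illustrative round-trip under that assumption; the reading side is not part of this diff, and decode_job_row is a hypothetical helper:

import json

def decode_job_row(row: dict) -> dict:
    """Decode the JSON-encoded job fields of a raw DB row (illustrative only)."""
    return {
        **row,
        "params": json.loads(row.get("params") or "{}"),
        "metrics": json.loads(row.get("metrics") or "{}"),
    }

row = {"id": "job-1", "params": json.dumps({"limit": "10"}), "metrics": json.dumps({})}
assert decode_job_row(row)["params"] == {"limit": "10"}
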
datachain/data_storage/schema.py CHANGED
@@ -34,8 +34,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
          if ec := c_set.get(c.name, None):
              if str(ec.type) != str(c.type):
                  raise ValueError(
-                     f"conflicting types for column {c.name}:"
-                     f"{c.type!s} and {ec.type!s}"
+                     f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
                  )
              continue
          c_set[c.name] = c
@@ -235,6 +234,7 @@ class DataTable:
      def file_columns(cls) -> list[sa.Column]:
          return [
              sa.Column("id", Int, primary_key=True),
+             sa.Column("random", Int64, nullable=False),
              sa.Column("vtype", String, nullable=False, index=True),
              sa.Column("dir_type", Int, index=True),
              sa.Column("parent", String, index=True),
@@ -246,7 +246,6 @@ class DataTable:
              sa.Column("size", Int64, nullable=False, index=True),
              sa.Column("owner_name", String),
              sa.Column("owner_id", String),
-             sa.Column("random", Int64, nullable=False),
              sa.Column("location", JSON),
              sa.Column("source", String, nullable=False),
          ]
datachain/data_storage/warehouse.py CHANGED
@@ -95,14 +95,14 @@ class AbstractWarehouse(ABC, Serializable):
 
          exc = None
          try:
-             if col_python_type == list and value_type in (list, tuple, set):
+             if col_python_type is list and value_type in (list, tuple, set):
                  if len(val) == 0:
                      return []
                  item_python_type = self.python_type(col_type.item_type)
-                 if item_python_type != list:
+                 if item_python_type is not list:
                      if isinstance(val[0], item_python_type):
                          return val
-                     if item_python_type == float and isinstance(val[0], int):
+                     if item_python_type is float and isinstance(val[0], int):
                          return [float(i) for i in val]
                  # Optimization: Reuse these values for each function call within the
                  # list comprehension.
@@ -114,18 +114,18 @@ class AbstractWarehouse(ABC, Serializable):
                  )
                  return [self.convert_type(i, *item_type_info) for i in val]
              # Special use case with JSON type as we save it as string
-             if col_python_type == dict or col_type_name == "JSON":
-                 if value_type == str:
+             if col_python_type is dict or col_type_name == "JSON":
+                 if value_type is str:
                      return val
                  if value_type in (dict, list):
                      return json.dumps(val)
                  raise ValueError(
-                     f"Cannot convert value {val!r} with type" f"{value_type} to JSON"
+                     f"Cannot convert value {val!r} with type {value_type} to JSON"
                  )
 
              if isinstance(val, col_python_type):
                  return val
-             if col_python_type == float and isinstance(val, int):
+             if col_python_type is float and isinstance(val, int):
                  return float(val)
          except Exception as e:  # noqa: BLE001
              exc = e
@@ -335,6 +335,7 @@ class AbstractWarehouse(ABC, Serializable):
              return select_query
          if recursive:
              root = False
+             where = self.path_expr(dr).op("GLOB")(path)
              if not path or path == "/":
                  # root of the bucket, e.g s3://bucket/ -> getting all the nodes
                  # in the bucket
@@ -344,14 +345,18 @@ class AbstractWarehouse(ABC, Serializable):
                  # not a root and not a explicit glob, so it's pointing to some directory
                  # and we are adding a proper glob syntax for it
                  # e.g s3://bucket/dir1 -> s3://bucket/dir1/*
-                 path = path.rstrip("/") + "/*"
+                 dir_path = path.rstrip("/") + "/*"
+                 where = where | self.path_expr(dr).op("GLOB")(dir_path)
 
              if not root:
                  # not a root, so running glob query
-                 select_query = select_query.where(self.path_expr(dr).op("GLOB")(path))
+                 select_query = select_query.where(where)
+
          else:
              parent = self.get_node_by_path(dr, path.lstrip("/").rstrip("/*"))
-             select_query = select_query.where(dr.c.parent == parent.path)
+             select_query = select_query.where(
+                 (dr.c.parent == parent.path) | (self.path_expr(dr) == path)
+             )
          return select_query
 
      def rename_dataset_table(
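
An aside on the GLOB change just above: a recursive listing of a directory now matches both the node at the path itself and everything under it, rather than only the subtree. A standalone SQLAlchemy sketch of the same combined condition (the nodes table and path column here are illustrative; GLOB is the SQLite operator used via .op("GLOB")):

import sqlalchemy as sa

nodes = sa.table("nodes", sa.column("path"))
path = "dir1"
where = nodes.c.path.op("GLOB")(path)                              # the node itself
where = where | nodes.c.path.op("GLOB")(path.rstrip("/") + "/*")   # its subtree
print(sa.select(nodes.c.path).where(where))
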
@@ -493,7 +498,10 @@ class AbstractWarehouse(ABC, Serializable):
          This gets nodes based on the provided query, and should be used sparingly,
          as it will be slow on any OLAP database systems.
          """
-         return (Node(*row) for row in self.db.execute(query))
+         columns = [c.name for c in query.columns]
+         for row in self.db.execute(query):
+             d = dict(zip(columns, row))
+             yield Node(**d)
 
      def get_dirs_by_parent_path(
          self,
@@ -570,14 +578,12 @@ class AbstractWarehouse(ABC, Serializable):
          matched_paths: list[list[str]] = [[]]
          for curr_name in path_list[:-1]:
              if glob.has_magic(curr_name):
-                 new_paths = []
+                 new_paths: list[list[str]] = []
                  for path in matched_paths:
                      nodes = self._get_nodes_by_glob_path_pattern(
                          dataset_rows, path, curr_name
                      )
-                     for node in nodes:
-                         if node.is_container:
-                             new_paths.append([*path, node.name or ""])
+                     new_paths.extend([*path, n.name] for n in nodes if n.is_container)
                  matched_paths = new_paths
              else:
                  for path in matched_paths:
@@ -772,7 +778,7 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          dataset_rows: "DataTable",
          parent_path: str,
-         fields: Optional[Iterable[str]] = None,
+         fields: Optional[Sequence[str]] = None,
          type: Optional[str] = None,
          conds=None,
          order_by: Optional[Union[str, list[str]]] = None,
@@ -794,9 +800,9 @@ class AbstractWarehouse(ABC, Serializable):
          else:
              conds.append(path != "")
 
-         if fields is None:
-             fields = [c.name for c in dr.file_columns()]
-         columns = [getattr(q.c, f) for f in fields]
+         columns = q.c
+         if fields:
+             columns = [getattr(columns, f) for f in fields]
 
          query = sa.select(*columns)
          query = query.where(*conds)
@@ -833,19 +839,16 @@ class AbstractWarehouse(ABC, Serializable):
 
          prefix_len = len(node.path)
 
-         def make_node_with_path(row):
-             sub_node = Node(*row)
-             return NodeWithPath(
-                 sub_node, sub_node.path[prefix_len:].lstrip("/").split("/")
-             )
+         def make_node_with_path(node: Node) -> NodeWithPath:
+             return NodeWithPath(node, node.path[prefix_len:].lstrip("/").split("/"))
 
-         return map(make_node_with_path, self.db.execute(query))
+         return map(make_node_with_path, self.get_nodes(query))
 
      def find(
          self,
          dataset_rows: "DataTable",
          node: Node,
-         fields: Iterable[str],
+         fields: Sequence[str],
          type=None,
          conds=None,
          order_by=None,
@@ -890,11 +893,9 @@ class AbstractWarehouse(ABC, Serializable):
      def is_temp_table_name(self, name: str) -> bool:
          """Returns if the given table name refers to a temporary
          or no longer needed table."""
-         if name.startswith(
+         return name.startswith(
              (self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX, "ds_shadow_")
-         ) or name.endswith("_shadow"):
-             return True
-         return False
+         ) or name.endswith("_shadow")
 
      def get_temp_table_names(self) -> list[str]:
          return [
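
One effect of switching get_nodes from positional Node(*row) to name-based construction (reused by the expand-path code above) is that node hydration no longer depends on the physical column order, which matters now that the random column moved in schema.py. A toy illustration with a stand-in NamedTuple; the real Node in datachain.node has more fields:

from typing import NamedTuple

class Node(NamedTuple):  # stand-in for datachain.node.Node
    id: int
    parent: str
    name: str

columns = ["name", "id", "parent"]   # selected column order can differ from field order
row = ("file.csv", 7, "dir1")
node = Node(**dict(zip(columns, row)))
assert node.id == 7 and node.name == "file.csv"
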
datachain/dataset.py CHANGED
@@ -405,9 +405,7 @@ class DatasetRecord:
          Checks if a number can be a valid next latest version for dataset.
          The only rule is that it cannot be lower than current latest version
          """
-         if self.latest_version and self.latest_version >= version:
-             return False
-         return True
+         return not (self.latest_version and self.latest_version >= version)
 
      def get_version(self, version: int) -> DatasetVersion:
          if not self.has_version(version):
datachain/lib/arrow.py ADDED
@@ -0,0 +1,85 @@
+ import re
+ from typing import TYPE_CHECKING, Optional
+
+ from pyarrow.dataset import dataset
+
+ from datachain.lib.feature import Feature
+ from datachain.lib.file import File
+
+ if TYPE_CHECKING:
+     import pyarrow as pa
+
+
+ class Source(Feature):
+     """File source info for tables."""
+
+     file: File
+     index: int
+
+
+ class ArrowGenerator:
+     def __init__(self, schema: Optional["pa.Schema"] = None, **kwargs):
+         """
+         Generator for getting rows from tabular files.
+
+         Parameters:
+
+         schema : Optional pyarrow schema for validation.
+         kwargs: Parameters to pass to pyarrow.dataset.dataset.
+         """
+         self.schema = schema
+         self.kwargs = kwargs
+
+     def __call__(self, file: File):
+         path = file.get_path()
+         ds = dataset(path, filesystem=file.get_fs(), schema=self.schema, **self.kwargs)
+         index = 0
+         for record_batch in ds.to_batches():
+             for record in record_batch.to_pylist():
+                 source = Source(file=file, index=index)
+                 yield [source, *record.values()]
+                 index += 1
+
+
+ def schema_to_output(schema: "pa.Schema"):
+     """Generate UDF output schema from pyarrow schema."""
+     default_column = 0
+     output = {"source": Source}
+     for field in schema:
+         column = field.name.lower()
+         column = re.sub("[^0-9a-z_]+", "", column)
+         if not column:
+             column = f"c{default_column}"
+             default_column += 1
+         output[column] = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+
+     return output
+
+
+ def _arrow_type_mapper(col_type: "pa.DataType") -> type:  # noqa: PLR0911
+     """Convert pyarrow types to basic types."""
+     from datetime import datetime
+
+     import pyarrow as pa
+
+     if pa.types.is_timestamp(col_type):
+         return datetime
+     if pa.types.is_binary(col_type):
+         return bytes
+     if pa.types.is_floating(col_type):
+         return float
+     if pa.types.is_integer(col_type):
+         return int
+     if pa.types.is_boolean(col_type):
+         return bool
+     if pa.types.is_date(col_type):
+         return datetime
+     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
+         return str
+     if pa.types.is_list(col_type):
+         return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[misc]
+     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
+         return dict
+     if isinstance(col_type, pa.lib.DictionaryType):
+         return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+     raise TypeError(f"{col_type!r} datatypes not supported")
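
A quick sketch of the new helper on a small pyarrow schema, assuming datachain 0.2.0 and pyarrow are installed; only schema_to_output from the file above is used, and the mapping it returns is what would feed a UDF output spec:

import pyarrow as pa
from datachain.lib.arrow import schema_to_output

schema = pa.schema(
    [
        ("Name", pa.string()),
        ("score", pa.float32()),
        ("tags", pa.list_(pa.string())),
    ]
)
print(schema_to_output(schema))
# expected shape: {'source': Source, 'name': str, 'score': float, 'tags': list[str]}
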