datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +57 -212
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/job.py +4 -3
- datachain/lib/arrow.py +21 -5
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +183 -59
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +39 -56
- datachain/lib/signal_schema.py +5 -2
- datachain/node.py +13 -0
- datachain/query/dataset.py +12 -105
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA +7 -3
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD +28 -27
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL +1 -1
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
```diff
@@ -143,7 +143,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         db.execute("PRAGMA synchronous = NORMAL")
         db.execute("PRAGMA case_sensitive_like = ON")
         if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
-            ...
+            import sys
+
+            db.set_trace_callback(sys.stderr.write)
 
         load_usearch_extension(db)
 
```
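With `DEBUG_SHOW_SQL_QUERIES` set, the engine now routes every executed statement to stderr through the standard library's statement-tracing hook. A minimal sketch of the same mechanism outside datachain:

```python
import sqlite3
import sys

db = sqlite3.connect(":memory:")
# sqlite3 passes each completed SQL statement to the callback as a string;
# writing to sys.stderr keeps the query log out of stdout.
db.set_trace_callback(sys.stderr.write)
db.execute("CREATE TABLE t (x INTEGER)")
db.execute("INSERT INTO t VALUES (1)")
```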
```diff
@@ -515,17 +517,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _datasets_dependencies_insert(self) -> "Insert":
         return sqlite.insert(self._datasets_dependencies)
 
-    #
-    # Storages
-    #
-
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-        self.db.execute(self._storages_delete().where(self._storages.c.uri == uri))
-
     #
     # Dataset dependencies
     #
```
datachain/data_storage/warehouse.py
CHANGED

```diff
@@ -218,35 +218,26 @@ class AbstractWarehouse(ABC, Serializable):
         results = None
         offset = 0
         num_yielded = 0
-        try:
-            while True:
-                if limit is not None:
-                    limit -= num_yielded
-                    if limit == 0:
-                        break
-                    if limit < page_size:
-                        paginated_query = paginated_query.limit(None).limit(limit)
-
-                results = self.dataset_rows_select(paginated_query.offset(offset))
-
-                processed = False
-                for row in results:
-                    processed = True
-                    yield row
-                    num_yielded += 1
-
-                if not processed:
-                    break  # no more results
-                offset += page_size
-        finally:
-            # https://www2.sqlite.org/cvstrac/wiki?p=DatabaseIsLocked (SELECT not
-            # finalized or reset) to prevent database table is locked error when an
-            # exception is raised in the middle of processing the results (e.g.
-            # https://github.com/iterative/dvcx/issues/924). Connections close
-            # apparently is not enough in some cases, at least on sqlite
-            # https://www.sqlite.org/c3ref/close.html
-            if results and hasattr(results, "close"):
-                results.close()
+
+        while True:
+            if limit is not None:
+                limit -= num_yielded
+                if limit == 0:
+                    break
+                if limit < page_size:
+                    paginated_query = paginated_query.limit(None).limit(limit)
+
+            results = self.dataset_rows_select(paginated_query.offset(offset))
+
+            processed = False
+            for row in results:
+                processed = True
+                yield row
+                num_yielded += 1
+
+            if not processed:
+                break  # no more results
+            offset += page_size
 
     #
     # Table Name Internal Functions
```
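The rewritten pagination loop drops the `try`/`finally` and its explicit `results.close()`. The core idea is offset pagination with an optional overall row limit, stopping on the first empty page. A simplified, self-contained sketch of that idea (`fetch_page` is a hypothetical stand-in for `self.dataset_rows_select(paginated_query.offset(...))`):

```python
from collections.abc import Iterator
from typing import Optional

ROWS = list(range(23))  # fake table standing in for a dataset

def fetch_page(offset: int, count: int) -> list[int]:
    # Stand-in for running the paginated SELECT against the warehouse.
    return ROWS[offset : offset + count]

def paginate(page_size: int, limit: Optional[int] = None) -> Iterator[int]:
    offset = 0
    remaining = limit
    while True:
        # Never request more rows than the remaining overall limit.
        count = page_size if remaining is None else min(page_size, remaining)
        if count == 0:
            return  # limit reached
        page = fetch_page(offset, count)
        if not page:
            return  # empty page: no more results
        for row in page:
            yield row
            if remaining is not None:
                remaining -= 1
        offset += page_size

assert list(paginate(page_size=10, limit=5)) == list(range(5))
assert len(list(paginate(page_size=10))) == 23
```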
datachain/dataset.py
CHANGED
```diff
@@ -11,8 +11,6 @@ from typing import (
 )
 from urllib.parse import urlparse
 
-from dateutil.parser import isoparse
-
 from datachain.client import Client
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
@@ -25,6 +23,7 @@ DD = TypeVar("DD", bound="DatasetDependency")
 
 DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
+LISTING_PREFIX = "lst__"
 
 
 def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
@@ -72,11 +71,22 @@ class DatasetDependencyType:
 class DatasetDependency:
     id: int
     type: str
-    name: str
-    version: str  #
+    name: str
+    version: str  # TODO change to int
     created_at: datetime
     dependencies: list[Optional["DatasetDependency"]]
 
+    @property
+    def dataset_name(self) -> str:
+        """Returns clean dependency dataset name"""
+        from datachain.lib.listing import parse_listing_uri
+
+        if self.type == DatasetDependencyType.DATASET:
+            return self.name
+
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        return list_dataset_name
+
     @classmethod
     def parse(
         cls: builtins.type[DD],
@@ -91,33 +101,31 @@ class DatasetDependency:
         dataset_version_created_at: Optional[datetime],
         bucket_uri: Optional["StorageURI"],
     ) -> Optional["DatasetDependency"]:
-        ...
-        # removing them from tables so that we can still have references
-        return None
+        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+
+        if not dataset_id:
+            return None
+
+        assert dataset_name is not None
+        dependency_type = DatasetDependencyType.DATASET
+        dependency_name = dataset_name
+
+        if is_listing_dataset(dataset_name):
+            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
+            dependency_name = listing_uri_from_name(dataset_name)
+
+        return cls(
+            id,
+            dependency_type,
+            dependency_name,
+            (
+                str(dataset_version)  # type: ignore[arg-type]
+                if dataset_version
+                else None
+            ),
+            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            [],
+        )
 
     @property
     def is_dataset(self) -> bool:
@@ -443,7 +451,11 @@ class DatasetRecord:
         For bucket listing we implicitly create underlying dataset to hold data. This
         method is checking if this is one of those datasets.
         """
-        ...
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )
 
     @property
     def versions_values(self) -> list[int]:
```
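`LISTING_PREFIX` gives the implicit listing datasets a recognizable name, and `DatasetRecord.is_bucket_listing` now matches on it in addition to data-source URIs. A hedged sketch of just the prefix check (`looks_like_listing` is a hypothetical helper; the real method also accepts URIs via `Client.is_data_source_uri`):

```python
LISTING_PREFIX = "lst__"

def looks_like_listing(name: str) -> bool:
    # Mirrors only the startswith() branch of is_bucket_listing.
    return name.startswith(LISTING_PREFIX)

assert looks_like_listing("lst__s3://my-bucket/images/")
assert not looks_like_listing("my_dataset")
```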
datachain/job.py
CHANGED
```diff
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
            status,
            created_at,
```
datachain/lib/arrow.py
CHANGED
```diff
@@ -7,7 +7,9 @@ import pyarrow as pa
 from pyarrow.dataset import dataset
 from tqdm import tqdm
 
+from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import File, IndexedFile
+from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
@@ -59,7 +61,13 @@ class ArrowGenerator(Generator):
         vals = list(record.values())
         if self.output_schema:
             fields = self.output_schema.model_fields
-            ...
+            vals_dict = {}
+            for (field, field_info), val in zip(fields.items(), vals):
+                if ModelStore.is_pydantic(field_info.annotation):
+                    vals_dict[field] = field_info.annotation(**val)  # type: ignore[misc]
+                else:
+                    vals_dict[field] = val
+            vals = [self.output_schema(**vals_dict)]
         if self.source:
             yield [IndexedFile(file=file, index=index), *vals]
         else:
@@ -95,15 +103,15 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None):
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
-        if field.nullable:
+        dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
+        if field.nullable and not ModelStore.is_pydantic(dtype):
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
 
     return output
 
 
-def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
 
@@ -123,7 +131,15 @@ def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
         return str
     if pa.types.is_list(col_type):
         return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
-    if pa.types.is_struct(col_type)
+    if pa.types.is_struct(col_type):
+        type_dict = {}
+        for field in col_type:
+            dtype = arrow_type_mapper(field.type, field.name)
+            if field.nullable and not ModelStore.is_pydantic(dtype):
+                dtype = Optional[dtype]  # type: ignore[assignment]
+            type_dict[field.name] = dtype
+        return dict_to_data_model(column, type_dict)
+
     if pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
```
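For struct columns, `arrow_type_mapper` now produces a dynamic data model (one field per struct member) via `dict_to_data_model` instead of falling through to `dict`. A sketch of the general pattern using plain pydantic (`dict_to_data_model` is datachain's helper; `create_model` stands in for it here):

```python
from typing import Optional

import pyarrow as pa
from pydantic import create_model

struct = pa.struct([("lat", pa.float64()), ("lon", pa.float64())])

# One (type, default) pair per struct member; both members are float64 here,
# so Optional[float] stands in for a full arrow-to-Python type mapping.
fields = {f.name: (Optional[float], None) for f in struct}
Point = create_model("point", **fields)

print(Point(lat=1.5, lon=-2.0))  # lat=1.5 lon=-2.0
```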
datachain/lib/dataset_info.py
CHANGED
```diff
@@ -23,6 +23,8 @@ class DatasetInfo(DataModel):
     size: Optional[int] = Field(default=None)
     params: dict[str, str] = Field(default=dict)
     metrics: dict[str, Any] = Field(default=dict)
+    error_message: str = Field(default="")
+    error_stack: str = Field(default="")
 
     @staticmethod
     def _validate_dict(
@@ -67,4 +69,6 @@ class DatasetInfo(DataModel):
             size=version.size,
             params=job.params if job else {},
             metrics=job.metrics if job else {},
+            error_message=version.error_message,
+            error_stack=version.error_stack,
         )
```
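The new fields surface a failed run's error details on `DatasetInfo`. A hedged sketch of how such message/stack pairs are typically captured (the generic pattern, not necessarily datachain's exact code):

```python
import traceback

error_message, error_stack = "", ""
try:
    raise ValueError("boom")  # stand-in for a failing job step
except ValueError as exc:
    error_message = str(exc)
    error_stack = traceback.format_exc()

assert error_message == "boom"
assert error_stack.rstrip().endswith("ValueError: boom")
```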