datachain 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +646 -152
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +39 -14
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/lib/webdataset.py +4 -3
  26. datachain/node.py +11 -8
  27. datachain/query/dataset.py +66 -147
  28. datachain/query/dispatch.py +15 -13
  29. datachain/query/schema.py +2 -0
  30. datachain/query/session.py +4 -4
  31. datachain/sql/functions/array.py +12 -0
  32. datachain/sql/functions/string.py +8 -0
  33. datachain/torch/__init__.py +1 -1
  34. datachain/utils.py +45 -0
  35. datachain-0.2.12.dist-info/METADATA +412 -0
  36. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/RECORD +40 -45
  37. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  38. datachain/lib/feature_registry.py +0 -77
  39. datachain/lib/gpt4_vision.py +0 -97
  40. datachain/lib/hf_image_to_text.py +0 -97
  41. datachain/lib/hf_pipeline.py +0 -90
  42. datachain/lib/image_transform.py +0 -103
  43. datachain/lib/iptc_exif_xmp.py +0 -76
  44. datachain/lib/unstructured.py +0 -41
  45. datachain/text/__init__.py +0 -3
  46. datachain-0.2.10.dist-info/METADATA +0 -430
  47. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
datachain/lib/clip.py CHANGED
@@ -31,7 +31,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
     )


-def similarity_scores(
+def clip_similarity_scores(
     images: Union[None, "Image.Image", list["Image.Image"]],
     text: Union[None, str, list[str]],
     model: Any,
@@ -43,71 +43,91 @@ def similarity_scores(
     """
     Calculate CLIP similarity scores between one or more images and/or text.

-    Args:
-        images: Images to use as inputs.
-        text: Text to use as inputs.
-        model: Model from clip or open_clip packages.
-        preprocess: Image preprocessor to apply.
-        tokenizer: Text tokenizer.
-        prob: Compute softmax probabilities.
-        image_to_text: Whether to compute for image-to-text or text-to-image. Ignored if
-            only one of images or text provided.
-
-
-    Examples
-    --------
-
-    using https://github.com/openai/CLIP
-    >>> import clip
-    >>> model, preprocess = clip.load("ViT-B/32")
-    >>> similarity_scores(img, "cat", model, preprocess, clip.tokenize)
-    [[21.813]]
-
-    using https://github.com/mlfoundations/open_clip
-    >>> import open_clip
-    >>> model, _, preprocess = open_clip.create_model_and_transforms(
-    ...     "ViT-B-32", pretrained="laion2b_s34b_b79k"
-    ... )
-    >>> tokenizer = open_clip.get_tokenizer("ViT-B-32")
-    >>> similarity_scores(img, "cat", model, preprocess, tokenizer)
-    [[21.813]]
-
-    using https://huggingface.co/docs/transformers/en/model_doc/clip
-    >>> from transformers import CLIPProcessor, CLIPModel
-    >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-    >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-    >>> scores = similarity_scores(
-    ...     img, "cat", model, processor.image_processor, processor.tokenizer
-    ... )
-    [[21.813]]
-
-    image -> list of text
-    >>> similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)
-    [[21.813, 35.313]]
-
-    list of images -> text
-    >>> similarity_scores([img1, img2], "cat", model, preprocess, tokenizer)
-    [[21.813], [83.123]]
-
-    list of images -> list of text
-    >>> similarity_scores([img1, img2], ["cat", "dog"], model, preprocess, tokenizer)
-    [[21.813, 35.313], [83.123, 34.843]]
-
-    list of images -> list of images
-    >>> similarity_scores([img1, img2], None, model, preprocess, tokenizer)
-    [[94.189, 37.092]]
-
-    list of text -> list of text
-    >>> similarity_scores(None, ["cat", "dog"], model, preprocess, tokenizer)
-    [[67.334, 23.588]]
-
-    text -> list of images
-    >>> similarity_scores([img1, img2], "cat", ..., image_to_text=False)
-    [[19.708, 19.842]]
-
-    show scores as softmax probabilities
-    >>> similarity_scores(img, ["cat", "dog"], ..., prob=True)
-    [[0.423, 0.577]]
+    Parameters:
+        images : Images to use as inputs.
+        text : Text to use as inputs.
+        model : Model from clip or open_clip packages.
+        preprocess : Image preprocessor to apply.
+        tokenizer : Text tokenizer.
+        prob : Compute softmax probabilities.
+        image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
+            if only one of images or text provided.
+
+
+    Example:
+        Using https://github.com/openai/CLIP
+        ```py
+        >>> import clip
+        >>> model, preprocess = clip.load("ViT-B/32")
+        >>> similarity_scores(img, "cat", model, preprocess, clip.tokenize)
+        [[21.813]]
+        ```
+
+        Using https://github.com/mlfoundations/open_clip
+        ```py
+        >>> import open_clip
+        >>> model, _, preprocess = open_clip.create_model_and_transforms(
+        ...     "ViT-B-32", pretrained="laion2b_s34b_b79k"
+        ... )
+        >>> tokenizer = open_clip.get_tokenizer("ViT-B-32")
+        >>> similarity_scores(img, "cat", model, preprocess, tokenizer)
+        [[21.813]]
+        ```
+
+        Using https://huggingface.co/docs/transformers/en/model_doc/clip
+        ```py
+        >>> from transformers import CLIPProcessor, CLIPModel
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        >>> scores = similarity_scores(
+        ...     img, "cat", model, processor.image_processor, processor.tokenizer
+        ... )
+        [[21.813]]
+        ```
+
+        Image -> list of text
+        ```py
+        >>> similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)
+        [[21.813, 35.313]]
+        ```
+
+        List of images -> text
+        ```py
+        >>> similarity_scores([img1, img2], "cat", model, preprocess, tokenizer)
+        [[21.813], [83.123]]
+        ```
+
+        List of images -> list of text
+        ```py
+        >>> similarity_scores(
+        ...     [img1, img2], ["cat", "dog"], model, preprocess, tokenizer
+        ... )
+        [[21.813, 35.313], [83.123, 34.843]]
+        ```
+
+        List of images -> list of images
+        ```py
+        >>> similarity_scores([img1, img2], None, model, preprocess, tokenizer)
+        [[94.189, 37.092]]
+        ```
+
+        List of text -> list of text
+        ```py
+        >>> similarity_scores(None, ["cat", "dog"], model, preprocess, tokenizer)
+        [[67.334, 23.588]]
+        ```
+
+        Text -> list of images
+        ```py
+        >>> similarity_scores([img1, img2], "cat", ..., image_to_text=False)
+        [[19.708, 19.842]]
+        ```
+
+        Show scores as softmax probabilities
+        ```py
+        >>> similarity_scores(img, ["cat", "dog"], ..., prob=True)
+        [[0.423, 0.577]]
+        ```
     """

     with torch.no_grad():
@@ -144,3 +164,6 @@ def similarity_scores(
         scores = logits

     return scores.tolist()
+
+
+similarity_scores = clip_similarity_scores
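The rename keeps a module-level alias, so existing imports of `similarity_scores` continue to work. A minimal sketch of both call paths, assuming `open_clip` and Pillow are installed and `cat.jpg` is a placeholder image path:

```py
# Minimal sketch of the rename: new name plus backward-compatible alias.
import open_clip
from PIL import Image

from datachain.lib.clip import clip_similarity_scores, similarity_scores

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
img = Image.open("cat.jpg")  # placeholder path

# New canonical name.
scores = clip_similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)

# Old name still resolves to the same function via the module-level alias.
assert similarity_scores is clip_similarity_scores
```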
datachain/lib/convert/{type_converter.py → python_to_sql.py} RENAMED
@@ -19,7 +19,7 @@ from datachain.sql.types import (
     String,
 )

-TYPE_TO_DATACHAIN = {
+PYTHON_TO_SQL = {
     int: Int64,
     str: String,
     Literal: String,
@@ -34,14 +34,14 @@ TYPE_TO_DATACHAIN = {
 }


-def convert_to_db_type(typ):  # noqa: PLR0911
+def python_to_sql(typ):  # noqa: PLR0911
     if inspect.isclass(typ):
         if issubclass(typ, SQLType):
             return typ
         if issubclass(typ, Enum):
             return str

-    res = TYPE_TO_DATACHAIN.get(typ)
+    res = PYTHON_TO_SQL.get(typ)
     if res:
         return res

@@ -59,19 +59,19 @@ def convert_to_db_type(typ):  # noqa: PLR0911
         if ModelStore.is_pydantic(args0):
             return Array(JSON())

-        next_type = convert_to_db_type(args0)
+        next_type = python_to_sql(args0)
         return Array(next_type)

     if orig is Annotated:
         # Ignoring annotations
-        return convert_to_db_type(args[0])
+        return python_to_sql(args[0])

     if inspect.isclass(orig) and issubclass(dict, orig):
         return JSON

     if orig == Union:
         if len(args) == 2 and (type(None) in args):
-            return convert_to_db_type(args[0])
+            return python_to_sql(args[0])

         if _is_json_inside_union(orig, args):
             return JSON
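The rename from `TYPE_TO_DATACHAIN`/`convert_to_db_type` to `PYTHON_TO_SQL`/`python_to_sql` is behavior-preserving. A hedged sketch of the branches visible above, assuming `Int64` and `String` are importable from `datachain.sql.types` as this file's import block suggests:

```py
from typing import Optional

from datachain.lib.convert.python_to_sql import python_to_sql
from datachain.sql.types import Int64, String

assert python_to_sql(int) is Int64             # direct lookup in PYTHON_TO_SQL
assert python_to_sql(str) is String
assert python_to_sql(Optional[str]) is String  # Union[X, None] unwraps to X
```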
datachain/lib/convert/sql_to_python.py ADDED
@@ -0,0 +1,23 @@
+from datetime import datetime
+from typing import Any
+
+from sqlalchemy import ARRAY, JSON, Boolean, DateTime, Float, Integer, String
+
+from datachain.data_storage.sqlite import Column
+
+SQL_TO_PYTHON = {
+    String: str,
+    Integer: int,
+    Float: float,
+    Boolean: bool,
+    DateTime: datetime,
+    ARRAY: list,
+    JSON: dict,
+}
+
+
+def sql_to_python(args_map: dict[str, Column]) -> dict[str, Any]:
+    return {
+        k: SQL_TO_PYTHON.get(type(v.type), str)  # type: ignore[union-attr]
+        for k, v in args_map.items()
+    }
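`sql_to_python` is the reverse lookup: it maps each named column's SQL type to a Python type, falling back to `str` for anything unrecognized. A hedged sketch using plain SQLAlchemy columns (real callers pass `datachain.data_storage.sqlite.Column` objects; the column names here are hypothetical):

```py
from sqlalchemy import Column, Float, Integer, String

from datachain.lib.convert.sql_to_python import sql_to_python

cols = {
    "name": Column("name", String),    # column.type becomes a String() instance
    "size": Column("size", Integer),
    "score": Column("score", Float),
}
# type(v.type) is looked up in SQL_TO_PYTHON; unknown types fall back to str.
assert sql_to_python(cols) == {"name": str, "size": int, "score": float}
```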
datachain/lib/convert/values_to_tuples.py CHANGED
@@ -9,41 +9,16 @@ class ValuesToTupleError(DataChainParamsError):
     def __init__(self, ds_name, msg):
         if ds_name:
             ds_name = f"' {ds_name}'"
-        super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")
+        super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")


-def values_to_tuples(
+def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
     **fr_map,
 ) -> tuple[Any, Any, Any]:
-    types_map = {}
-    length = -1
-    for k, v in fr_map.items():
-        if not isinstance(v, Sequence) or isinstance(v, str):
-            raise ValuesToTupleError(ds_name, f"features '{k}' is not a sequence")
-        len_ = len(v)
-
-        if len_ == 0:
-            raise ValuesToTupleError(ds_name, f"feature '{k}' is empty list")
-
-        if length < 0:
-            length = len_
-        elif length != len_:
-            raise ValuesToTupleError(
-                ds_name,
-                f"feature '{k}' should have length {length} while {len_} is given",
-            )
-        typ = type(v[0])
-        if not is_chain_type(typ):
-            raise ValuesToTupleError(
-                ds_name,
-                f"feature '{k}' has unsupported type '{typ.__name__}'."
-                f" Please use Feature types: {DataTypeNames}",
-            )
-        types_map[k] = typ
     if output:
-        if not isinstance(output, Sequence) and not isinstance(output, str):
+        if not isinstance(output, (Sequence, str, dict)):
             if len(fr_map) != 1:
                 raise ValuesToTupleError(
                     ds_name,
@@ -58,21 +33,64 @@
         key: str = next(iter(fr_map.keys()))
         output = {key: output}  # type: ignore[dict-item]

+        if not isinstance(output, dict):
+            raise ValuesToTupleError(
+                ds_name,
+                "output type must be dict[str, DataType] while "
+                f"'{type(output).__name__}' is given",
+            )
+
         if len(output) != len(fr_map):
             raise ValuesToTupleError(
                 ds_name,
                 f"number of outputs '{len(output)}' should match"
-                f" number of features '{len(fr_map)}'",
+                f" number of signals '{len(fr_map)}'",
             )
-    if isinstance(output, dict):
+
+    types_map = {}
+    length = -1
+    for k, v in fr_map.items():
+        if not isinstance(v, Sequence) or isinstance(v, str):
+            raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
+        len_ = len(v)
+
+        if output:
+            if k not in output:  # type: ignore[operator]
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"signal '{k}' is not present in the output",
+                )
+        else:
+            if len_ == 0:
+                raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
+
+            typ = type(v[0])
+            if not is_chain_type(typ):
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"signal '{k}' has unsupported type '{typ.__name__}'."
+                    f" Please use DataModel types: {DataTypeNames}",
+                )
+            types_map[k] = typ
+
+        if length < 0:
+            length = len_
+        elif length != len_:
             raise ValuesToTupleError(
                 ds_name,
-                "output type must be dict[str, FeatureType] while "
-                f"'{type(output).__name__}' is given",
+                f"signal '{k}' should have length {length} while {len_} is given",
             )
-    else:
+
+    if not output:
         output = types_map  # type: ignore[assignment]

+    if not output:
+        raise ValuesToTupleError(
+            ds_name,
+            "output type must be dict[str, DataType] while empty is given"
+            " and no signals are provided",
+        )
+
     output_types: list[type] = list(output.values())  # type: ignore[union-attr,call-arg,arg-type]
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
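The refactor reorders validation: `output` is reconciled with the keyword signals first, and the per-signal checks (sequence, non-empty, consistent length, supported type) now run in a single pass. An illustrative sketch of the rules enforced above (dataset and signal names are made up):

```py
from datachain.lib.convert.values_to_tuples import (
    ValuesToTupleError,
    values_to_tuples,
)

# Equal-length sequences of supported types pass.
values_to_tuples("my-ds", num=[1, 2, 3], label=["a", "b", "c"])

# Mismatched lengths raise ValuesToTupleError:
# "signal 'label' should have length 3 while 2 is given".
try:
    values_to_tuples("my-ds", num=[1, 2, 3], label=["a", "b"])
except ValuesToTupleError as exc:
    print(exc)

# With an explicit output mapping, every signal must appear in it.
try:
    values_to_tuples(
        "my-ds",
        output={"num": int, "other": str},
        num=[1, 2],
        extra=[True, False],
    )
except ValuesToTupleError as exc:
    print(exc)  # signal 'extra' is not present in the output
```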
datachain/lib/data_model.py CHANGED
@@ -1,14 +1,11 @@
 from collections.abc import Sequence
 from datetime import datetime
-from typing import TYPE_CHECKING, ClassVar, Union, get_args, get_origin
+from typing import ClassVar, Union, get_args, get_origin

 from pydantic import BaseModel

 from datachain.lib.model_store import ModelStore

-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
-
 StandardType = Union[
     type[int],
     type[str],
@@ -24,18 +21,14 @@ DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"


 class DataModel(BaseModel):
-    _version: ClassVar[int] = 1
+    """Pydantic model wrapper that registers model with `DataChain`."""

-    def get_value(self):
-        """Getting value from data. It's used in conjunction with method that operate
-        with raw data such as to_pytorch(). In contrast to method that operated with
-        data structures such as pydantic"""
-        return
+    _version: ClassVar[int] = 1

     @classmethod
     def __pydantic_init_subclass__(cls):
         """It automatically registers every declared DataModel child class."""
-        ModelStore.add(cls)
+        ModelStore.register(cls)

     @staticmethod
     def register(models: Union[DataType, Sequence[DataType]]):
@@ -44,25 +37,11 @@ class DataModel(BaseModel):
         if not isinstance(models, Sequence):
             models = [models]
         for val in models:
-            ModelStore.add(val)
-
-
-class FileBasic(DataModel):
-    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
-        pass
-
-    def open(self):
-        raise NotImplementedError
-
-    def read(self):
-        with self.open() as stream:
-            return stream.read()
-
-    def get_value(self):
-        return self.read()
+            ModelStore.register(val)


 def is_chain_type(t: type) -> bool:
+    """Return true if type is supported by `DataChain`."""
     if ModelStore.is_pydantic(t):
         return True
     if any(t is ft or t is get_args(ft)[0] for ft in get_args(StandardType)):
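With `FileBasic` and `get_value` removed, `DataModel` is now a thin Pydantic wrapper whose main job is registration: subclasses register themselves via `__pydantic_init_subclass__`, and plain Pydantic models can be registered explicitly. A small sketch (class names are illustrative):

```py
from pydantic import BaseModel

from datachain.lib.data_model import DataModel

class Pose(DataModel):
    # Auto-registered with ModelStore via __pydantic_init_subclass__.
    x: float
    y: float

class Meta(BaseModel):
    # Plain Pydantic models are registered explicitly.
    author: str

DataModel.register(Meta)
```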
datachain/lib/dataset_info.py ADDED
@@ -0,0 +1,70 @@
+import json
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from pydantic import Field, field_validator
+
+from datachain.dataset import DatasetRecord, DatasetStatus, DatasetVersion
+from datachain.job import Job
+from datachain.lib.data_model import DataModel
+from datachain.utils import TIME_ZERO
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+
+class DatasetInfo(DataModel):
+    name: str
+    version: int = Field(default=1)
+    status: int = Field(default=DatasetStatus.CREATED)
+    created_at: datetime = Field(default=TIME_ZERO)
+    finished_at: Optional[datetime] = Field(default=None)
+    num_objects: Optional[int] = Field(default=None)
+    size: Optional[int] = Field(default=None)
+    params: dict[str, str] = Field(default=dict)
+    metrics: dict[str, Any] = Field(default=dict)
+
+    @staticmethod
+    def _validate_dict(
+        v: Optional[Union[str, dict]],
+    ) -> dict:
+        if v is None or v == "":
+            return {}
+        if isinstance(v, str):
+            try:
+                return json.loads(v)
+            except Exception as e:  # noqa: BLE001
+                raise ValueError(
+                    f"Unable to convert string '{v}' to dict for Dataset feature: {e}"
+                ) from None
+        return v
+
+    # Workaround for empty JSONs converted to empty strings in some DBs.
+    @field_validator("params", mode="before")
+    @classmethod
+    def validate_location(cls, v):
+        return cls._validate_dict(v)
+
+    @field_validator("metrics", mode="before")
+    @classmethod
+    def validate_metrics(cls, v):
+        return cls._validate_dict(v)
+
+    @classmethod
+    def from_models(
+        cls,
+        dataset: DatasetRecord,
+        version: DatasetVersion,
+        job: Optional[Job],
+    ) -> "Self":
+        return cls(
+            name=dataset.name,
+            version=version.version,
+            status=version.status,
+            created_at=version.created_at,
+            finished_at=version.finished_at,
+            num_objects=version.num_objects,
+            size=version.size,
+            params=job.params if job else {},
+            metrics=job.metrics if job else {},
+        )