datachain 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.


datachain/lib/image.py CHANGED
@@ -1,6 +1,5 @@
-import inspect
 from io import BytesIO
-from typing import Any, Callable, Optional
+from typing import Callable, Optional, Union
 
 from datachain.lib.file import File
 
@@ -14,8 +13,6 @@ except ImportError as exc:
         " pip install 'datachain[cv]'\n"
     ) from exc
 
-from datachain.lib.reader import FeatureReader
-
 
 class ImageFile(File):
     def get_value(self):
@@ -28,8 +25,8 @@ def convert_image(
     mode: str = "RGB",
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
-    open_clip_model: Optional[Any] = None,
-):
+    encoder: Optional[Callable] = None,
+) -> Union[Image.Image, torch.Tensor]:
     """
     Resize, transform, and otherwise convert an image.
 
@@ -37,8 +34,8 @@ def convert_image(
         img (Image): PIL.Image object.
         mode (str): PIL.Image mode.
        size (tuple[int, int]): Size in (width, height) pixels for resizing.
-        transform (Callable): Torchvision v1 or other transform to apply.
-        open_clip_model (Any): Encode image using model from open_clip library.
+        transform (Callable): Torchvision transform or huggingface processor to apply.
+        encoder (Callable): Encode image using model.
     """
     if mode:
         img = img.convert(mode)
@@ -46,86 +43,47 @@ def convert_image(
         img = img.resize(size)
     if transform:
         img = transform(img)
-        if open_clip_model:
+
+        try:
+            from transformers.image_processing_utils import BaseImageProcessor
+
+            if isinstance(transform, BaseImageProcessor):
+                img = torch.tensor(img.pixel_values[0])  # type: ignore[assignment,attr-defined]
+        except ImportError:
+            pass
+        if encoder:
             img = img.unsqueeze(0)  # type: ignore[attr-defined]
-    if open_clip_model:
-        method_name = "encode_image"
-        if not (
-            hasattr(open_clip_model, method_name)
-            and inspect.ismethod(getattr(open_clip_model, method_name))
-        ):
-            raise ValueError(
-                "Unable to render Image: 'open_clip_model' doesn't support"
-                f" '{method_name}()'"
-            )
-        img = open_clip_model.encode_image(img)
+    if encoder:
+        img = encoder(img)
     return img
 
 
-class ImageReader(FeatureReader):
-    def __init__(
-        self,
-        mode: str = "RGB",
-        size: Optional[tuple[int, int]] = None,
-        transform: Optional[Callable] = None,
-        open_clip_model: Any = None,
-    ):
-        """
-        Read and optionally transform an image.
-
-        All kwargs are passed to `convert_image()`.
-        """
-        self.mode = mode
-        self.size = size
-        self.transform = transform
-        self.open_clip_model = open_clip_model
-        super().__init__(ImageFile)
-
-    def __call__(self, img: Image.Image):
-        return convert_image(
-            img,
-            mode=self.mode,
-            size=self.size,
-            transform=self.transform,
-            open_clip_model=self.open_clip_model,
-        )
-
-
-def similarity_scores(
-    model: Any,
-    preprocess: Callable,
-    tokenizer: Callable,
-    image: Image.Image,
-    text: str,
-    prob: bool = False,
-) -> list[float]:
+def convert_images(
+    images: Union[Image.Image, list[Image.Image]],
+    mode: str = "RGB",
+    size: Optional[tuple[int, int]] = None,
+    transform: Optional[Callable] = None,
+    encoder: Optional[Callable] = None,
+) -> Union[list[Image.Image], torch.Tensor]:
     """
-    Calculate CLIP similarity scores for one or more texts given an image.
+    Resize, transform, and otherwise convert one or more images.
 
     Args:
-        model: Model from clip or open_clip packages.
-        preprocess: Image preprocessing transforms.
-        tokenizer: Text tokenizer.
-        image: Image.
-        text: Text.
-        prob: Compute softmax probabilities across texts.
+        img (Image, list[Image]): PIL.Image object or list of objects.
+        mode (str): PIL.Image mode.
+        size (tuple[int, int]): Size in (width, height) pixels for resizing.
+        transform (Callable): Torchvision transform or huggingface processor to apply.
+        encoder (Callable): Encode image using model.
     """
+    if isinstance(images, Image.Image):
+        images = [images]
 
-    with torch.no_grad():
-        image = preprocess(image).unsqueeze(0)
-        text = tokenizer(text)
-
-        image_features = model.encode_image(image)
-        text_features = model.encode_text(text)
-
-        image_features /= image_features.norm(dim=-1, keepdim=True)
-        text_features /= text_features.norm(dim=-1, keepdim=True)
+    converted = [convert_image(img, mode, size, transform) for img in images]
 
-        logits_per_text = 100.0 * image_features @ text_features.T
+    if isinstance(converted[0], torch.Tensor):
+        converted = torch.stack(converted)  # type: ignore[assignment,arg-type]
 
-        if prob:
-            scores = logits_per_text.softmax(dim=1)
-        else:
-            scores = logits_per_text
+    if encoder:
+        converted = encoder(converted)
 
-        return scores[0].tolist()
+    return converted  # type: ignore[return-value]
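Taken together, these changes drop the open_clip-specific hook in favor of a generic encoder callable and add batching via convert_images. A minimal usage sketch under those assumptions (the torchvision pipeline and the commented open_clip encoder are illustrative, not from the diff):

from PIL import Image
from torchvision import transforms

from datachain.lib.image import convert_images

# Two in-memory images stand in for real files.
images = [Image.new("RGB", (64, 64)), Image.new("RGB", (32, 32))]

# A torchvision transform turns each image into a tensor, so
# convert_images stacks the results into one (N, C, H, W) batch.
to_tensor = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
batch = convert_images(images, transform=to_tensor)  # torch.Size([2, 3, 224, 224])

# The new encoder argument accepts any callable over the stacked batch,
# e.g. model.encode_image from an open_clip model (hypothetical here),
# replacing the removed open_clip_model parameter:
# embeddings = convert_images(images, transform=preprocess, encoder=model.encode_image)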
datachain/lib/pytorch.py CHANGED
@@ -116,10 +116,12 @@ class PytorchDataset(IterableDataset):
                 self.transform = None
             if self.tokenizer:
                 for i, val in enumerate(row):
-                    if isinstance(val, str):
+                    if isinstance(val, str) or (
+                        isinstance(val, list) and isinstance(val[0], str)
+                    ):
                         row[i] = convert_text(
                             val, self.tokenizer, self.tokenizer_kwargs
-                        )
+                        ).squeeze(0)  # type: ignore[union-attr]
             yield row
 
     @staticmethod
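This extends tokenization from single strings to lists of strings and strips the leading batch dimension the tokenizer adds. A rough sketch of the branch's behavior, assuming a HuggingFace tokenizer with tensor output (the model name and kwargs are illustrative):

from transformers import AutoTokenizer

from datachain.lib.text import convert_text

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
kwargs = {"return_tensors": "pt", "padding": True}

# A plain caption and a list of captions both hit the new branch;
# squeeze(0) drops the (1, L) batch dim for the single-string case
# and is a no-op for the (N, L) list case.
single = convert_text("a photo of a dog", tokenizer, kwargs).squeeze(0)
batch = convert_text(["a dog", "a cat"], tokenizer, kwargs).squeeze(0)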
datachain/lib/signal_schema.py CHANGED
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin
 
 from pydantic import create_model
 
-from datachain.lib.arrow import Source
 from datachain.lib.feature import (
     DATACHAIN_TO_TYPE,
     DEFAULT_DELIMITER,
@@ -14,17 +13,13 @@ from datachain.lib.feature import (
     convert_type_to_datachain,
 )
 from datachain.lib.feature_registry import Registry
-from datachain.lib.file import File, TextFile
-from datachain.lib.image import ImageFile
+from datachain.lib.file import File
 from datachain.lib.utils import DataChainParamsError
-from datachain.lib.webdataset import TarStream, WDSAllFile, WDSBasic
-from datachain.lib.webdataset_laion import Laion, WDSLaion
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
 
-# TODO fix hardcoded Feature class names with://github.com/iterative/dvcx/issues/1625
 NAMES_TO_TYPES = {
     "int": int,
     "str": str,
@@ -34,15 +29,6 @@ NAMES_TO_TYPES = {
     "dict": dict,
     "bytes": bytes,
     "datetime": datetime,
-    "WDSLaion": WDSLaion,
-    "Laion": Laion,
-    "Source": Source,
-    "File": File,
-    "ImageFile": ImageFile,
-    "TextFile": TextFile,
-    "TarStream": TarStream,
-    "WDSBasic": WDSBasic,
-    "WDSAllFile": WDSAllFile,
 }
 
 
@@ -150,7 +136,7 @@ class SignalSchema:
         )
 
     def slice(self, keys: Sequence[str]) -> "SignalSchema":
-        return SignalSchema({k: v for k, v in self.values.items() if k in keys})
+        return SignalSchema({k: self.values[k] for k in keys if k in self.values})
 
     def row_to_features(self, row: Sequence, catalog: "Catalog") -> list[FeatureType]:
         res = []
@@ -240,37 +226,6 @@ class SignalSchema:
         if has_subtree and issubclass(type_, File):
             yield ".".join(path)
 
-    def get_file_signals_values(self, row: dict[str, Any]) -> dict[str, Any]:
-        """
-        Method that returns values with clean field names (without prefix) for
-        all file signals found in this schema for some row
-        Output example:
-        {
-            laion.file: {
-                "source": "s3://ldb-public",
-                "name": "dog.jpg",
-                ...
-            },
-            meta.file: {
-                "source": "s3://datacomp",
-                "name": "cat.jpg",
-                ...
-            }
-        }
-        """
-        res = {}
-
-        for file_signals in self.get_file_signals():
-            prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
-            res[file_signals] = {
-                c_name.removeprefix(prefix): c_value
-                for c_name, c_value in row.items()
-                if c_name.startswith(prefix)
-                and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
-            }
-
-        return res
-
     def create_model(self, name: str) -> type[Feature]:
         fields = {key: (value, None) for key, value in self.values.items()}
 
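The rewritten slice() changes behavior subtly: the result now follows the order of keys rather than schema insertion order, and keys missing from the schema are skipped instead of ignored by filtering. A small sketch, assuming a schema of simple signal types:

from datachain.lib.file import File
from datachain.lib.signal_schema import SignalSchema

schema = SignalSchema({"file": File, "label": str, "score": float})

# Result follows the order of `keys`; "missing" is silently dropped.
sliced = schema.slice(["score", "file", "missing"])
# sliced.values == {"score": float, "file": File}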
datachain/lib/text.py CHANGED
@@ -1,19 +1,15 @@
-import inspect
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
-from datachain.lib.file import TextFile
-from datachain.lib.reader import FeatureReader
-
 if TYPE_CHECKING:
-    from datachain.lib.feature_utils import FeatureLike
+    import torch
 
 
 def convert_text(
     text: Union[str, list[str]],
     tokenizer: Optional[Callable] = None,
     tokenizer_kwargs: Optional[dict[str, Any]] = None,
-    open_clip_model: Optional[Any] = None,
-):
+    encoder: Optional[Callable] = None,
+) -> Union[str, list[str], "torch.Tensor"]:
     """
     Tokenize and otherwise transform text.
 
@@ -21,18 +17,8 @@ def convert_text(
         text (str): Text to convert.
         tokenizer (Callable): Tokenizer to use to tokenize objects.
         tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
-        open_clip_model (Any): Encode text using model from open_clip library.
+        encoder (Callable): Encode text using model.
     """
-    if open_clip_model:
-        method_name = "encode_text"
-        if not (
-            hasattr(open_clip_model, method_name)
-            and inspect.ismethod(getattr(open_clip_model, method_name))
-        ):
-            raise ValueError(
-                f"TextColumn error: 'model' doesn't support '{method_name}()'"
-            )
-
     if not tokenizer:
         return text
 
@@ -43,38 +29,21 @@ def convert_text(
         res = tokenizer(text, **tokenizer_kwargs)
     else:
         res = tokenizer(text)
-    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-    tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-
-    if not open_clip_model:
-        return tokens.squeeze(0)
-
-    return open_clip_model.encode_text(tokens).squeeze(0)
+    try:
+        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
+        tokens = (
+            res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
+        )
+    except ImportError:
+        tokens = res
 
-class TextReader(FeatureReader):
-    def __init__(
-        self,
-        fr_class: "FeatureLike" = TextFile,
-        tokenizer: Optional[Callable] = None,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        open_clip_model: Optional[Any] = None,
-    ):
-        """
-        Read and optionally transform a text column.
+    if not encoder:
+        return tokens
 
-        All kwargs are passed to `convert_text()`.
-        """
-        self.tokenizer = tokenizer
-        self.tokenizer_kwargs = tokenizer_kwargs
-        self.open_clip_model = open_clip_model
-        super().__init__(fr_class)
+    try:
+        import torch
+    except ImportError:
+        "Missing dependency 'torch' needed to encode text."
 
-    def __call__(self, value: Union[str, list[str]]):
-        return convert_text(
-            value,
-            tokenizer=self.tokenizer,
-            tokenizer_kwargs=self.tokenizer_kwargs,
-            open_clip_model=self.open_clip_model,
-        )
+    return encoder(torch.tensor(tokens))
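As with images, text encoding now takes a generic encoder callable instead of an open_clip model object. A sketch of the two call styles, assuming open_clip is installed (the model name and pretrained tag are illustrative):

import open_clip

from datachain.lib.text import convert_text

model, _, _ = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Tokenize only: returns the token tensor unchanged.
tokens = convert_text("a photo of a dog", tokenizer=tokenizer)

# Tokenize and encode; encoder= replaces the old open_clip_model= argument.
embedding = convert_text(
    "a photo of a dog", tokenizer=tokenizer, encoder=model.encode_text
)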
datachain/lib/udf.py CHANGED
@@ -1,11 +1,12 @@
 import inspect
 import sys
 import traceback
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import TYPE_CHECKING, Callable
 
 from datachain.lib.feature import Feature
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.utils import DataChainError, DataChainParamsError
+from datachain.lib.udf_signature import UdfSignature
+from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
 from datachain.query import udf
 
 if TYPE_CHECKING:
@@ -17,26 +18,68 @@ class UdfError(DataChainParamsError):
         super().__init__(f"UDF error: {msg}")
 
 
-class UDFBase:
+class UDFBase(AbstractUDF):
     is_input_batched = False
     is_output_batched = False
     is_input_grouped = False
 
-    def __init__(
-        self,
-        params: SignalSchema,
-        output: SignalSchema,
-        func: Optional[Callable] = None,
-    ):
+    def __init__(self):
+        self.params = None
+        self.output = None
+        self.params_spec = None
+        self.output_spec = None
+        self._contains_stream = None
+        self._catalog = None
+        self._func = None
+
+    def process(self, *args, **kwargs):
+        """Processing function that needs to be defined by user"""
+        if not self._func:
+            raise NotImplementedError("UDF processing is not implemented")
+        return self._func(*args, **kwargs)
+
+    def setup(self):
+        """Initialization process executed on each worker before processing begins.
+        This is needed for tasks like pre-loading ML models prior to scoring.
+        """
+
+    def teardown(self):
+        """Teardown process executed on each process/worker after processing ends.
+        This is needed for tasks like closing connections to end-points.
+        """
+
+    def _init(self, sign: UdfSignature, params: SignalSchema, func: Callable):
         self.params = params
-        self.output = output
-        self._func = func
+        self.output = sign.output_schema
 
-        params_spec = params.to_udf_spec()
+        params_spec = self.params.to_udf_spec()
         self.params_spec = list(params_spec.keys())
-        self.output_spec = output.to_udf_spec()
+        self.output_spec = self.output.to_udf_spec()
 
-        self._catalog = None
+        self._func = func
+
+    @classmethod
+    def _create(
+        cls,
+        target_class: type["UDFBase"],
+        sign: UdfSignature,
+        params: SignalSchema,
+        catalog,
+    ) -> "UDFBase":
+        if isinstance(sign.func, AbstractUDF):
+            if not isinstance(sign.func, target_class):  # type: ignore[unreachable]
+                raise UdfError(
+                    f"cannot create UDF: provided UDF '{sign.func.__name__}'"
+                    f" must be a child of target class '{target_class.__name__}'",
+                )
+            result = sign.func
+            func = None
+        else:
+            result = target_class()
+            func = sign.func
+
+        result._init(sign, params, func)
+        return result
 
     @property
     def name(self):
@@ -53,25 +96,10 @@ class UDFBase:
         udf_wrapper = udf(self.params_spec, self.output_spec, batch=batch)
         return udf_wrapper(self)
 
-    def bootstrap(self):
-        """Initialization process executed on each worker before processing begins.
-        This is needed for tasks like pre-loading ML models prior to scoring.
-        """
-
-    def teardown(self):
-        """Teardown process executed on each process/worker after processing ends.
-        This is needed for tasks like closing connections to end-points.
-        """
-
-    def process(self, *args, **kwargs):
-        if not self._func:
-            raise NotImplementedError("UDF processing is not implemented")
-        return self._func(*args, **kwargs)
-
     def validate_results(self, results, *args, **kwargs):
         return results
 
-    def __call__(self, *rows, **kwargs):
+    def __call__(self, *rows):
         if self.is_input_grouped:
             objs = self._parse_grouped_rows(rows)
         else:
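The UDFBase refactor makes UDFs stateful classes: process() is the user hook, setup() replaces the old bootstrap(), and _create() accepts either a plain function or an AbstractUDF instance. A sketch of the class-based style this enables, with a hypothetical toy UDF:

from datachain.lib.udf import UDFBase


class TextLength(UDFBase):  # hypothetical user-defined UDF
    def setup(self):
        # Runs once per worker before processing (replaces bootstrap()).
        self.prefix = "len:"

    def process(self, text: str) -> str:
        # The per-row hook, used in place of a plain function UDF.
        return f"{self.prefix}{len(text)}"

    def teardown(self):
        # Runs on each worker after processing ends.
        self.prefix = None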
datachain/lib/udf_signature.py CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, Optional, Union, get_args, get_origin
 
 from datachain.lib.feature import Feature, FeatureType, FeatureTypeNames
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.utils import DataChainParamsError
+from datachain.lib.utils import AbstractUDF, DataChainParamsError
 
 
 class UdfSignatureError(DataChainParamsError):
@@ -49,10 +49,13 @@ class UdfSignature:
         else:
             if func is None:
                 raise UdfSignatureError(chain, "user function is not defined")
+
             udf_func = func
             signal_name = None
+
         if not callable(udf_func):
-            raise UdfSignatureError(chain, f"function '{func}' is not callable")
+            raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")
+
         func_params_map_sign, func_outs_sign, is_iterator = (
             UdfSignature._func_signature(chain, udf_func)
         )
@@ -108,13 +111,6 @@ class UdfSignature:
         if isinstance(output, str):
             output = [output]
         if isinstance(output, Sequence):
-            if not func_outs_sign:
-                raise UdfSignatureError(
-                    chain,
-                    "output types are not specified. Specify types in 'output' as"
-                    " a dict or as function return value hint.",
-                )
-
             if len(func_outs_sign) != len(output):
                 raise UdfSignatureError(
                     chain,
@@ -158,8 +154,13 @@ class UdfSignature:
 
     @staticmethod
     def _func_signature(
-        chain: str, func: Callable
+        chain: str, udf_func: Callable
    ) -> tuple[dict[str, type], Sequence[type], bool]:
+        if isinstance(udf_func, AbstractUDF):
+            func = udf_func.process  # type: ignore[unreachable]
+        else:
+            func = udf_func
+
         sign = inspect.signature(func)
 
         input_map = {prm.name: prm.annotation for prm in sign.parameters.values()}
datachain/lib/utils.py CHANGED
@@ -1,3 +1,20 @@
+from abc import ABC, abstractmethod
+
+
+class AbstractUDF(ABC):
+    @abstractmethod
+    def process(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def setup(self):
+        pass
+
+    @abstractmethod
+    def teardown(self):
+        pass
+
+
 class DataChainError(Exception):
     def __init__(self, message):
         super().__init__(message)
datachain/lib/webdataset.py CHANGED
@@ -2,6 +2,7 @@ import hashlib
 import json
 import tarfile
 from collections.abc import Iterator, Sequence
+from pathlib import Path
 from typing import (
     Any,
     Callable,
@@ -240,10 +241,9 @@ class TarStream(File):
 def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
     builder = Builder(stream, core_extensions, spec, tar, encoding)
 
-    for item in tar.getmembers():
+    for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
         if not item.isfile():
             continue
-
         try:
             builder.add(item)
         except StopIteration:
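Sorting tar members by filename stem groups each WebDataset sample's files together even when the archive interleaves them, which is what the sequential Builder expects. The effect in isolation, as a self-contained sketch:

from pathlib import Path

# Interleaved archive order: all .jpg members, then all .json members.
names = ["0001.jpg", "0002.jpg", "0001.json", "0002.json"]
print(sorted(names, key=lambda n: Path(n).stem))
# ['0001.jpg', '0001.json', '0002.jpg', '0002.json']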
datachain/listing.py CHANGED
@@ -20,9 +20,6 @@ if TYPE_CHECKING:
     from datachain.storage import Storage
 
 
-RANDOM_BITS = 63  # size of the random integer field
-
-
 class Listing:
     def __init__(
         self,