datachain 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (39)
  1. datachain/__init__.py +0 -4
  2. datachain/catalog/catalog.py +17 -2
  3. datachain/cli.py +8 -1
  4. datachain/data_storage/db_engine.py +0 -2
  5. datachain/data_storage/schema.py +15 -26
  6. datachain/data_storage/sqlite.py +3 -0
  7. datachain/data_storage/warehouse.py +1 -7
  8. datachain/lib/arrow.py +7 -13
  9. datachain/lib/cached_stream.py +3 -85
  10. datachain/lib/clip.py +151 -0
  11. datachain/lib/dc.py +41 -59
  12. datachain/lib/feature.py +5 -1
  13. datachain/lib/feature_registry.py +3 -2
  14. datachain/lib/feature_utils.py +1 -2
  15. datachain/lib/file.py +17 -24
  16. datachain/lib/image.py +37 -79
  17. datachain/lib/pytorch.py +4 -2
  18. datachain/lib/signal_schema.py +3 -4
  19. datachain/lib/text.py +18 -49
  20. datachain/lib/udf.py +64 -55
  21. datachain/lib/udf_signature.py +11 -10
  22. datachain/lib/utils.py +17 -0
  23. datachain/lib/webdataset.py +2 -2
  24. datachain/listing.py +0 -3
  25. datachain/query/dataset.py +66 -46
  26. datachain/query/dispatch.py +2 -2
  27. datachain/query/schema.py +1 -8
  28. datachain/query/udf.py +16 -18
  29. datachain/sql/sqlite/base.py +34 -2
  30. datachain/sql/sqlite/vector.py +13 -5
  31. datachain/utils.py +28 -0
  32. {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/METADATA +3 -2
  33. {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/RECORD +37 -38
  34. {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/WHEEL +1 -1
  35. datachain/_version.py +0 -16
  36. datachain/lib/reader.py +0 -49
  37. {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/LICENSE +0 -0
  38. {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/entry_points.txt +0 -0
  39. {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py CHANGED
@@ -14,7 +14,7 @@ import sqlalchemy
 
 from datachain.lib.feature import Feature, FeatureType
 from datachain.lib.feature_utils import features_to_tuples
-from datachain.lib.file import File, get_file
+from datachain.lib.file import File, IndexedFile, get_file
 from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -39,6 +39,8 @@ if TYPE_CHECKING:
     import pandas as pd
     from typing_extensions import Self
 
+    from datachain.catalog import Catalog
+
 C = Column
 
 
@@ -200,10 +202,12 @@ class DataChain(DatasetQuery):
     def from_storage(
         cls,
         path,
+        *,
         type: Literal["binary", "text", "image"] = "binary",
+        catalog: Optional["Catalog"] = None,
         recursive: Optional[bool] = True,
         anon: bool = False,
-    ) -> "DataChain":
+    ) -> "Self":
         """Get data from a storage as a list of file with all file attributes. It
         returns the chain itself as usual.
 
@@ -220,7 +224,7 @@ class DataChain(DatasetQuery):
         ```
         """
         func = get_file(type)
-        return DataChain(path, recursive=recursive, anon=anon).map(file=func)
+        return cls(path, catalog=catalog, recursive=recursive, anon=anon).map(file=func)
 
     @classmethod
     def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
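As of 0.2.2 everything after `path` is keyword-only, and an explicit `Catalog` can be injected. A hedged usage sketch of the new signature; the bucket URI is illustrative and not taken from the diff:

```python
from datachain.lib.dc import DataChain

# Everything after the path must be passed by keyword in 0.2.2.
images = DataChain.from_storage(
    "s3://my-bucket/images/",  # illustrative URI
    type="image",
    recursive=True,
    anon=True,
)
```

The new `catalog=` argument is optional and defaults to `None`.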
@@ -433,8 +437,7 @@ class DataChain(DatasetQuery):
 
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
 
-        chain = DatasetQuery.add_signals(
-            self,
+        chain = self.add_signals(
             udf_obj.to_udf_wrapper(self._settings.batch),
             **self._settings.to_dict(),
         )
@@ -530,23 +533,23 @@ class DataChain(DatasetQuery):
         signal_map,
     ) -> UDFBase:
         is_generator = target_class.is_output_batched
-        name = self.name or "Unknown"
+        name = self.name or ""
+
         sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
+        params_schema = self.signals_schema.slice(sign.params)
 
-        params_feature = self.signals_schema.slice(sign.params)
-        udf = target_class(params_feature, sign.output_schema, func=sign.func)
-        udf.set_catalog(self.catalog)
-        return udf
+        return UDFBase._create(target_class, sign, params_schema, self.catalog)
 
     def _extend_features(self, method_name, *args, **kwargs):
         super_func = getattr(super(), method_name)
 
         new_schema = self.signals_schema.resolve(*args)
-        columns = new_schema.db_signals()
-        chain = super_func(*columns, **kwargs)
-        chain.signals_schema = new_schema
+        columns = [C(col) for col in new_schema.db_signals()]
+        res = super_func(*columns, **kwargs)
+        if isinstance(res, DataChain):
+            res.signals_schema = new_schema
 
-        return chain
+        return res
 
     @detach
     def select(self, *args: str) -> "Self":
@@ -699,6 +702,9 @@ class DataChain(DatasetQuery):
         right_on = on
         right_on_columns = on_columns
 
+        if self == right_ds:
+            right_ds = right_ds.clone(new_table=True)
+
         ops = [
             self.c(left) == right_ds.c(right)
             for left, right in zip(on_columns, right_on_columns)
@@ -774,11 +780,11 @@ class DataChain(DatasetQuery):
         from pyarrow import unify_schemas
         from pyarrow.dataset import dataset
 
-        from datachain.lib.arrow import ArrowGenerator, Source, schema_to_output
+        from datachain.lib.arrow import ArrowGenerator, schema_to_output
 
         schema = None
         if output:
-            output = {"source": Source} | output
+            output = {"source": IndexedFile} | output
         else:
             schemas = []
             for row in self.select("file").iterate():
@@ -791,7 +797,6 @@ class DataChain(DatasetQuery):
             schema = unify_schemas(schemas)
             try:
                 output = schema_to_output(schema)
-                print(f"Inferred tabular data schema: {output}")
             except ValueError as e:
                 raise DatasetPrepareError(self.name, e) from e
 
@@ -893,15 +898,26 @@ class DataChain(DatasetQuery):
        >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
        """
        session = Session.get(session)
-        dsr = cls.create_empty_record(session=session)
-
-        if to_insert is not None:
-            if not isinstance(to_insert, list):
-                to_insert = [to_insert]
-
-            for record in to_insert:
-                cls.insert_record(dsr, record, session=session)
+        catalog = session.catalog
 
+        name = session.generate_temp_dataset_name()
+        columns: tuple[sqlalchemy.Column[Any], ...] = tuple(
+            sqlalchemy.Column(name, typ)
+            for name, typ in File._datachain_column_types.items()
+        )
+        dsr = catalog.create_dataset(name, columns=columns)
+
+        if isinstance(to_insert, dict):
+            to_insert = [to_insert]
+        elif not to_insert:
+            to_insert = []
+
+        warehouse = catalog.warehouse
+        dr = warehouse.dataset_rows(dsr)
+        db = warehouse.db
+        insert_q = dr.get_table().insert()
+        for record in to_insert:
+            db.execute(insert_q.values(**record))
        return DataChain(name=dsr.name)
 
    def sum(self, fr: FeatureType):  # type: ignore[override]
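`create_empty` now creates a temporary dataset with the `File` columns through the catalog and inserts the records directly via the warehouse, but the call site from the docstring above is unchanged. A hedged sketch of how it is driven:

```python
from datachain.lib.dc import DataChain

# Single record, as in the docstring example above.
single = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)

# A list of record dicts (column name -> value) is also accepted,
# and passing an empty/None value produces an empty chain.
several = DataChain.create_empty([DataChain.DEFAULT_FILE_RECORD] * 3)
empty = DataChain.create_empty(None)
```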
@@ -915,37 +931,3 @@ class DataChain(DatasetQuery):
 
     def max(self, fr: FeatureType): # type: ignore[override]
         return self._extend_features("max", fr)
-
-    @detach
-    def gen_random(self) -> "DataChain":
-        from random import getrandbits
-
-        from datachain.data_storage.warehouse import RANDOM_BITS
-
-        if "random" not in self.signals_schema.values:
-            chain = self.map(random=lambda: getrandbits(RANDOM_BITS), output=int).save()
-            return chain.select_except("random")
-
-        return self
-
-    @detach
-    def shuffle(self) -> "DataChain":
-        """Return results in deterministic random order."""
-        chain = self.gen_random()
-        return DatasetQuery.shuffle(chain)
-
-    @detach
-    def chunk(self, index: int, total: int) -> "DataChain":
-        """Split a query into smaller chunks for e.g. parallelization.
-
-        Examples:
-            >>> dc = DataChain(...)
-            >>> chunk_1 = dc._chunk(0, 2)
-            >>> chunk_2 = dc._chunk(1, 2)
-
-        Note:
-            Bear in mind that `index` is 0-indexed but `total` isn't.
-            Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
-        """
-        chain = self.gen_random()
-        return DatasetQuery.chunk(chain, index, total)
datachain/lib/feature.py CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 from functools import lru_cache
 from types import GenericAlias
 from typing import (
+    TYPE_CHECKING,
     Any,
     ClassVar,
     Literal,
@@ -39,6 +40,9 @@ from datachain.sql.types import (
     String,
 )
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
 FeatureStandardType = Union[
     type[int],
     type[str],
@@ -158,7 +162,7 @@ class Feature(BaseModel):
         s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
         return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
 
-    def _set_stream(self, catalog, stream=None, caching_enabled: bool = False) -> None:
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
         pass
 
     @classmethod
datachain/lib/feature_registry.py CHANGED
@@ -1,6 +1,7 @@
+import logging
 from typing import Any, ClassVar, Optional
 
-from datachain.cli import logger
+logger = logging.getLogger(__name__)
 
 
 class Registry:
@@ -16,7 +17,7 @@ class Registry:
         version = fr._version  # type: ignore[attr-defined]
         if version in cls.reg[name]:
             full_name = f"{name}@{version}"
-            logger.warning(f"Feature {full_name} is already registered")
+            logger.warning("Feature %s is already registered", full_name)
         cls.reg[name][version] = fr
 
     @classmethod
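The registry now creates its own module-level logger instead of importing the CLI's, and switches to lazy %-style formatting so the message is only interpolated when the record is actually emitted. A minimal sketch of the same pattern (the `register` helper and its arguments are illustrative, not datachain code):

```python
import logging

logger = logging.getLogger(__name__)  # per-module logger, no dependency on the CLI module


def register(name: str, version: int, seen: set[tuple[str, int]]) -> None:
    # Lazy %-formatting: no string building happens unless WARNING is enabled.
    if (name, version) in seen:
        logger.warning("Feature %s@%s is already registered", name, version)
    seen.add((name, version))
```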
datachain/lib/feature_utils.py CHANGED
@@ -11,11 +11,10 @@ from datachain.lib.feature import (
     FeatureTypeNames,
     convert_type_to_datachain,
 )
-from datachain.lib.reader import FeatureReader
 from datachain.lib.utils import DataChainParamsError
 from datachain.query.schema import Column
 
-FeatureLike = Union[type["Feature"], FeatureReader, Column, str]
+FeatureLike = Union[type["Feature"], Column, str]
 
 AUTO_FEATURE_PREFIX = "_auto_fr"
 SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase
datachain/lib/file.py CHANGED
@@ -2,11 +2,10 @@ import json
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path
-from typing import Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
 
-from fsspec import Callback
 from fsspec.implementations.local import LocalFileSystem
 from pydantic import Field, field_validator
 
@@ -18,6 +17,9 @@ from datachain.lib.utils import DataChainError
 from datachain.sql.types import JSON, Int, String
 from datachain.utils import TIME_ZERO
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
 
 class FileFeature(Feature):
     _is_file = True
@@ -182,26 +184,17 @@ class File(FileFeature):
 
     def open(self):
         if self._stream is None:
-            if self._catalog is None:
-                raise FileError(self, "stream is not set")
-            self._stream = self._open_stream()
+            raise FileError(self, "stream is not set")
 
         if self.location:
             return VFileRegistry.resolve(self, self.location)
         return self._stream
 
 
-    def _set_stream(
-        self, catalog=None, stream=None, caching_enabled: bool = False
-    ) -> None:
-        if self._catalog is None and catalog is None:
-            raise DataChainError(f"Cannot set file '{stream}' without catalog")
-
-        if catalog:
-            self._catalog = catalog
-
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
+        self._catalog = catalog
         stream_class = PreCachedStream if caching_enabled else PreDownloadStream
-        self._stream = stream_class(stream, self.size, self._catalog, self.get_uid())
+        self._stream = stream_class(self._catalog, self.get_uid())
         self._caching_enabled = caching_enabled
 
     def get_uid(self) -> UniqueId:
@@ -232,11 +225,6 @@ class File(FileFeature):
     def get_uri(self):
         return f"{self.source}/{self.get_full_name()}"
 
-    def _open_stream(self, cache: bool = False, cb: Optional[Callback] = None):
-        client = self._catalog.get_client(self.source)
-        uid = self.get_uid()
-        return client.open_object(uid, use_cache=cache, cb=cb)
-
     def get_path(self) -> str:
         path = unquote(self.get_uri())
         fs = self.get_fs()
@@ -258,10 +246,8 @@ class TextFile(File):
         super().__init__(**kwargs)
         self._stream = None
 
-    def _set_stream(
-        self, catalog=None, stream=None, caching_enabled: bool = False
-    ) -> None:
-        super()._set_stream(catalog, stream, caching_enabled)
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
+        super()._set_stream(catalog, caching_enabled)
         self._stream.set_mode("r")
 
 
@@ -296,3 +282,10 @@ def get_file(type: Literal["binary", "text", "image"] = "binary"):
     )
 
     return get_file_type
+
+
+class IndexedFile(Feature):
+    """File source info for tables."""
+
+    file: File
+    index: int
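Stream wiring also changed shape: `_set_stream()` no longer receives an already-open file object; the stream class is constructed from the catalog plus the file's `UniqueId` and is expected to open the object on demand (most of `datachain/lib/cached_stream.py` was removed in the same release, see the file list above). A rough stand-in sketch of that lazy pattern, with a hypothetical `DemoStream` in place of `PreDownloadStream`/`PreCachedStream`; it also takes the source URI separately for simplicity:

```python
class DemoStream:
    """Illustrative only; not datachain's actual stream classes."""

    def __init__(self, catalog, uid, source):
        self._catalog = catalog
        self._uid = uid
        self._source = source
        self._mode = "rb"
        self._fobj = None

    def set_mode(self, mode: str) -> None:
        self._mode = mode

    def read(self, *args):
        if self._fobj is None:
            # The same calls the removed File._open_stream() made,
            # deferred until the data is actually needed.
            client = self._catalog.get_client(self._source)
            self._fobj = client.open_object(self._uid, use_cache=False)
        return self._fobj.read(*args)
```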
datachain/lib/image.py CHANGED
@@ -1,6 +1,5 @@
-import inspect
 from io import BytesIO
-from typing import Any, Callable, Optional
+from typing import Callable, Optional, Union
 
 from datachain.lib.file import File
 
@@ -14,8 +13,6 @@ except ImportError as exc:
         " pip install 'datachain[cv]'\n"
     ) from exc
 
-from datachain.lib.reader import FeatureReader
-
 
 class ImageFile(File):
     def get_value(self):
@@ -28,8 +25,8 @@ def convert_image(
     mode: str = "RGB",
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
-    open_clip_model: Optional[Any] = None,
-):
+    encoder: Optional[Callable] = None,
+) -> Union[Image.Image, torch.Tensor]:
     """
     Resize, transform, and otherwise convert an image.
 
@@ -37,8 +34,8 @@
         img (Image): PIL.Image object.
         mode (str): PIL.Image mode.
         size (tuple[int, int]): Size in (width, height) pixels for resizing.
-        transform (Callable): Torchvision v1 or other transform to apply.
-        open_clip_model (Any): Encode image using model from open_clip library.
+        transform (Callable): Torchvision transform or huggingface processor to apply.
+        encoder (Callable): Encode image using model.
     """
     if mode:
         img = img.convert(mode)
@@ -46,86 +43,47 @@
         img = img.resize(size)
     if transform:
         img = transform(img)
-    if open_clip_model:
+
+        try:
+            from transformers.image_processing_utils import BaseImageProcessor
+
+            if isinstance(transform, BaseImageProcessor):
+                img = torch.tensor(img.pixel_values[0])  # type: ignore[assignment,attr-defined]
+        except ImportError:
+            pass
+    if encoder:
         img = img.unsqueeze(0)  # type: ignore[attr-defined]
-    if open_clip_model:
-        method_name = "encode_image"
-        if not (
-            hasattr(open_clip_model, method_name)
-            and inspect.ismethod(getattr(open_clip_model, method_name))
-        ):
-            raise ValueError(
-                "Unable to render Image: 'open_clip_model' doesn't support"
-                f" '{method_name}()'"
-            )
-        img = open_clip_model.encode_image(img)
+    if encoder:
+        img = encoder(img)
     return img
 
 
-class ImageReader(FeatureReader):
-    def __init__(
-        self,
-        mode: str = "RGB",
-        size: Optional[tuple[int, int]] = None,
-        transform: Optional[Callable] = None,
-        open_clip_model: Any = None,
-    ):
-        """
-        Read and optionally transform an image.
-
-        All kwargs are passed to `convert_image()`.
-        """
-        self.mode = mode
-        self.size = size
-        self.transform = transform
-        self.open_clip_model = open_clip_model
-        super().__init__(ImageFile)
-
-    def __call__(self, img: Image.Image):
-        return convert_image(
-            img,
-            mode=self.mode,
-            size=self.size,
-            transform=self.transform,
-            open_clip_model=self.open_clip_model,
-        )
-
-
-def similarity_scores(
-    model: Any,
-    preprocess: Callable,
-    tokenizer: Callable,
-    image: Image.Image,
-    text: str,
-    prob: bool = False,
-) -> list[float]:
+def convert_images(
+    images: Union[Image.Image, list[Image.Image]],
+    mode: str = "RGB",
+    size: Optional[tuple[int, int]] = None,
+    transform: Optional[Callable] = None,
+    encoder: Optional[Callable] = None,
+) -> Union[list[Image.Image], torch.Tensor]:
     """
-    Calculate CLIP similarity scores for one or more texts given an image.
+    Resize, transform, and otherwise convert one or more images.
 
     Args:
-        model: Model from clip or open_clip packages.
-        preprocess: Image preprocessing transforms.
-        tokenizer: Text tokenizer.
-        image: Image.
-        text: Text.
-        prob: Compute softmax probabilities across texts.
+        img (Image, list[Image]): PIL.Image object or list of objects.
+        mode (str): PIL.Image mode.
+        size (tuple[int, int]): Size in (width, height) pixels for resizing.
+        transform (Callable): Torchvision transform or huggingface processor to apply.
+        encoder (Callable): Encode image using model.
     """
+    if isinstance(images, Image.Image):
+        images = [images]
 
-    with torch.no_grad():
-        image = preprocess(image).unsqueeze(0)
-        text = tokenizer(text)
-
-        image_features = model.encode_image(image)
-        text_features = model.encode_text(text)
-
-        image_features /= image_features.norm(dim=-1, keepdim=True)
-        text_features /= text_features.norm(dim=-1, keepdim=True)
+    converted = [convert_image(img, mode, size, transform) for img in images]
 
-        logits_per_text = 100.0 * image_features @ text_features.T
+    if isinstance(converted[0], torch.Tensor):
+        converted = torch.stack(converted)  # type: ignore[assignment,arg-type]
 
-        if prob:
-            scores = logits_per_text.softmax(dim=1)
-        else:
-            scores = logits_per_text
+    if encoder:
+        converted = encoder(converted)
 
-    return scores[0].tolist()
+    return converted  # type: ignore[return-value]
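The removed `ImageReader` and `similarity_scores` helpers give way to the generic `encoder` hook (CLIP-style scoring now lives in the new datachain/lib/clip.py listed above). A hedged usage sketch of `convert_images` with open_clip; the model/checkpoint names and file paths are illustrative, and open_clip, torch, and PIL must be installed:

```python
import open_clip
from PIL import Image

from datachain.lib.image import convert_images

# Illustrative model/checkpoint; any open_clip pairing works the same way.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)

imgs = [Image.open("cat.jpg"), Image.open("dog.jpg")]  # hypothetical local files

# Preprocess both images into a stacked tensor and encode the batch.
embeddings = convert_images(
    imgs,
    transform=preprocess,
    encoder=model.encode_image,
)
print(embeddings.shape)  # e.g. torch.Size([2, 512])
```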
datachain/lib/pytorch.py CHANGED
@@ -116,10 +116,12 @@ class PytorchDataset(IterableDataset):
                 self.transform = None
             if self.tokenizer:
                 for i, val in enumerate(row):
-                    if isinstance(val, str):
+                    if isinstance(val, str) or (
+                        isinstance(val, list) and isinstance(val[0], str)
+                    ):
                         row[i] = convert_text(
                             val, self.tokenizer, self.tokenizer_kwargs
-                        )
+                        ).squeeze(0)  # type: ignore[union-attr]
             yield row
 
     @staticmethod
datachain/lib/signal_schema.py CHANGED
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin
 
 from pydantic import create_model
 
-from datachain.lib.arrow import Source
 from datachain.lib.feature import (
     DATACHAIN_TO_TYPE,
     DEFAULT_DELIMITER,
@@ -14,7 +13,7 @@ from datachain.lib.feature import (
     convert_type_to_datachain,
 )
 from datachain.lib.feature_registry import Registry
-from datachain.lib.file import File, TextFile
+from datachain.lib.file import File, IndexedFile, TextFile
 from datachain.lib.image import ImageFile
 from datachain.lib.utils import DataChainParamsError
 from datachain.lib.webdataset import TarStream, WDSAllFile, WDSBasic
@@ -36,7 +35,7 @@ NAMES_TO_TYPES = {
     "datetime": datetime,
     "WDSLaion": WDSLaion,
     "Laion": Laion,
-    "Source": Source,
+    "Source": IndexedFile,
     "File": File,
     "ImageFile": ImageFile,
     "TextFile": TextFile,
@@ -150,7 +149,7 @@ class SignalSchema:
         )
 
     def slice(self, keys: Sequence[str]) -> "SignalSchema":
-        return SignalSchema({k: v for k, v in self.values.items() if k in keys})
+        return SignalSchema({k: self.values[k] for k in keys if k in self.values})
 
     def row_to_features(self, row: Sequence, catalog: "Catalog") -> list[FeatureType]:
         res = []
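The rewritten `slice()` iterates over the requested `keys` instead of over `self.values`, so the sliced schema now follows the order of the keys you ask for (and silently skips unknown ones). The difference is the same as between these two plain-dict comprehensions (values here are illustrative):

```python
values = {"file": "File", "label": "str", "score": "float"}
keys = ["score", "file", "missing"]

# 0.2.0 behaviour: iterate the schema, keep its original order.
old = {k: v for k, v in values.items() if k in keys}
# {'file': 'File', 'score': 'float'}

# 0.2.2 behaviour: iterate the requested keys, keep their order.
new = {k: values[k] for k in keys if k in values}
# {'score': 'float', 'file': 'File'}
```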
datachain/lib/text.py CHANGED
@@ -1,19 +1,15 @@
-import inspect
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
-from datachain.lib.file import TextFile
-from datachain.lib.reader import FeatureReader
-
 if TYPE_CHECKING:
-    from datachain.lib.feature_utils import FeatureLike
+    import torch
 
 
 def convert_text(
     text: Union[str, list[str]],
     tokenizer: Optional[Callable] = None,
     tokenizer_kwargs: Optional[dict[str, Any]] = None,
-    open_clip_model: Optional[Any] = None,
-):
+    encoder: Optional[Callable] = None,
+) -> Union[str, list[str], "torch.Tensor"]:
     """
     Tokenize and otherwise transform text.
 
@@ -21,18 +17,8 @@ def convert_text(
         text (str): Text to convert.
         tokenizer (Callable): Tokenizer to use to tokenize objects.
         tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
-        open_clip_model (Any): Encode text using model from open_clip library.
+        encoder (Callable): Encode text using model.
     """
-    if open_clip_model:
-        method_name = "encode_text"
-        if not (
-            hasattr(open_clip_model, method_name)
-            and inspect.ismethod(getattr(open_clip_model, method_name))
-        ):
-            raise ValueError(
-                f"TextColumn error: 'model' doesn't support '{method_name}()'"
-            )
-
     if not tokenizer:
         return text
 
@@ -43,38 +29,21 @@ def convert_text(
         res = tokenizer(text, **tokenizer_kwargs)
     else:
         res = tokenizer(text)
-    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-    tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-
-    if not open_clip_model:
-        return tokens.squeeze(0)
-
-    return open_clip_model.encode_text(tokens).squeeze(0)
+    try:
+        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
+        tokens = (
+            res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
+        )
+    except ImportError:
+        tokens = res
 
-class TextReader(FeatureReader):
-    def __init__(
-        self,
-        fr_class: "FeatureLike" = TextFile,
-        tokenizer: Optional[Callable] = None,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        open_clip_model: Optional[Any] = None,
-    ):
-        """
-        Read and optionally transform a text column.
+    if not encoder:
+        return tokens
 
-        All kwargs are passed to `convert_text()`.
-        """
-        self.tokenizer = tokenizer
-        self.tokenizer_kwargs = tokenizer_kwargs
-        self.open_clip_model = open_clip_model
-        super().__init__(fr_class)
+    try:
+        import torch
+    except ImportError:
+        "Missing dependency 'torch' needed to encode text."
 
-    def __call__(self, value: Union[str, list[str]]):
-        return convert_text(
-            value,
-            tokenizer=self.tokenizer,
-            tokenizer_kwargs=self.tokenizer_kwargs,
-            open_clip_model=self.open_clip_model,
-        )
+    return encoder(torch.tensor(tokens))
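As in image.py, the removed `TextReader` is replaced by the generic `encoder` hook on `convert_text`. A hedged usage sketch with open_clip; the model and tokenizer names are illustrative, and open_clip plus torch must be installed:

```python
import open_clip

from datachain.lib.text import convert_text

# Illustrative open_clip model; any tokenizer/encoder pair can be plugged in.
model, _, _ = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Tokenize the text and encode the token tensor with the model.
emb = convert_text(
    "a photo of a dog",
    tokenizer=tokenizer,
    encoder=model.encode_text,
)
print(emb.shape)  # e.g. torch.Size([1, 512])
```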