datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.

Potentially problematic release.



Files changed (44)
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/lib/signal_schema.py CHANGED
@@ -1,8 +1,11 @@
 import copy
-from collections.abc import Sequence
+from collections.abc import Iterator, Sequence
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional, Union, get_args, get_origin

+from pydantic import create_model
+
+from datachain.lib.arrow import Source
 from datachain.lib.feature import (
     DATACHAIN_TO_TYPE,
     DEFAULT_DELIMITER,
@@ -11,10 +14,11 @@ from datachain.lib.feature import (
     convert_type_to_datachain,
 )
 from datachain.lib.feature_registry import Registry
-from datachain.lib.file import File, ImageFile, TextFile
+from datachain.lib.file import File, TextFile
+from datachain.lib.image import ImageFile
 from datachain.lib.utils import DataChainParamsError
 from datachain.lib.webdataset import TarStream, WDSAllFile, WDSBasic
-from datachain.lib.webdataset_laion import Laion, LaionParquet, WDSLaion
+from datachain.lib.webdataset_laion import Laion, WDSLaion

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -32,7 +36,7 @@ NAMES_TO_TYPES = {
     "datetime": datetime,
     "WDSLaion": WDSLaion,
     "Laion": Laion,
-    "LaionParquet": LaionParquet,
+    "Source": Source,
     "File": File,
     "ImageFile": ImageFile,
     "TextFile": TextFile,
@@ -64,6 +68,7 @@ class SignalResolvingTypeError(SignalResolvingError):
 class SignalSchema:
     def __init__(self, values: dict[str, FeatureType]):
         self.values = values
+        self.tree = self._build_tree()

     @staticmethod
     def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
@@ -119,26 +124,10 @@ class SignalSchema:

     def to_udf_spec(self) -> dict[str, Any]:
         res = {}
-        for signal, fr_type in self.values.items():
-            signal_path = signal.split(".")
-
-            if Feature.is_feature(fr_type):
-                delimiter = fr_type._delimiter  # type: ignore[union-attr]
-                if fr_type._is_shallow:  # type: ignore[union-attr]
-                    signal_path = []
-                spec = fr_type._to_udf_spec()  # type: ignore[union-attr]
-                for attr, value in spec:
-                    name_path = [*signal_path, attr]
-                    res[delimiter.join(name_path)] = value
-            else:
-                delimiter = DEFAULT_DELIMITER
-                try:
-                    type_ = convert_type_to_datachain(fr_type)
-                except TypeError as err:
-                    raise SignalSchemaError(
-                        f"unsupported type '{fr_type}' for signal '{signal}'"
-                    ) from err
-                res[delimiter.join(signal_path)] = type_
+        for path, type_, has_subtree, _ in self.get_flat_tree():
+            if not has_subtree:
+                db_name = DEFAULT_DELIMITER.join(path)
+                res[db_name] = convert_type_to_datachain(type_)
         return res

     def row_to_objs(self, row: Sequence[Any]) -> list[FeatureType]:
@@ -179,35 +168,37 @@ class SignalSchema:
         return res

     def db_signals(self) -> list[str]:
-        res = []
-        for name, fr_cls in self.values.items():
-            prefixes = name.split(".")
-
-            if not Feature.is_feature(fr_cls):
-                res.append(DEFAULT_DELIMITER.join(prefixes))
-            else:
-                if fr_cls._is_shallow:  # type: ignore[union-attr]
-                    prefixes = []
-                spec = fr_cls._to_udf_spec()  # type: ignore[union-attr]
-                new_db_signals = [
-                    DEFAULT_DELIMITER.join([*prefixes, name]) for name, type_ in spec
-                ]
-                res.extend(new_db_signals)
-        return res
+        return [
+            DEFAULT_DELIMITER.join(path)
+            for path, _, has_subtree, _ in self.get_flat_tree()
+            if not has_subtree
+        ]

     def resolve(self, *names: str) -> "SignalSchema":
         schema = {}
-        tree = self._get_prefix_tree()
         for field in names:
             if not isinstance(field, str):
                 raise SignalResolvingTypeError("select()", field)
-
-            path = field.split(".")
-            cls, position = self._find_feature_in_prefix_tree(tree, path)
-            schema[field] = self._find_in_feature(cls, path, position)
+            schema[field] = self._find_in_tree(field.split("."))

         return SignalSchema(schema)

+    def _find_in_tree(self, path: list[str]) -> FeatureType:
+        curr_tree = self.tree
+        curr_type = None
+        i = 0
+        while curr_tree is not None and i < len(path):
+            if val := curr_tree.get(path[i], None):
+                curr_type, curr_tree = val
+            else:
+                curr_type = None
+            i += 1
+
+        if curr_type is None:
+            raise SignalResolvingError(path, "is not found")
+
+        return curr_type
+
     def select_except_signals(self, *args: str) -> "SignalSchema":
         schema = copy.deepcopy(self.values)
         for field in args:
@@ -224,59 +215,6 @@ class SignalSchema:

         return SignalSchema(schema)

-    def _get_prefix_tree(self) -> dict[str, Any]:
-        tree: dict[str, Any] = {}
-        for name, fr_cls in self.values.items():
-            prefixes = name.split(".")
-
-            curr_tree = {}
-            curr_prefix = ""
-            for prefix in prefixes:
-                if not curr_prefix:
-                    curr_prefix = prefix
-                    curr_tree = tree
-                else:
-                    new_tree = curr_tree.get(curr_prefix, {})
-                    curr_tree[curr_prefix] = new_tree
-                    curr_tree = new_tree
-                    curr_prefix = prefix
-
-            curr_tree[curr_prefix] = fr_cls
-        return tree
-
-    def _find_feature_in_prefix_tree(
-        self, tree: dict, path: list[str]
-    ) -> tuple[FeatureType, int]:
-        for i in range(len(path)):
-            prefix = path[i]
-            if prefix not in tree:
-                raise SignalResolvingError(path, f"'{prefix}' is not found")
-            val = tree[prefix]
-            if not isinstance(val, dict):
-                return val, i + 1
-            tree = val
-
-        next_keys = ", ".join(tree.keys())
-        raise SignalResolvingError(
-            path,
-            f"it's not a terminal value or feature, next item might be '{next_keys}'",
-        )
-
-    def _find_in_feature(
-        self, cls: FeatureType, path: list[str], position: int
-    ) -> FeatureType:
-        if position == len(path):
-            return cls
-
-        name = path[position]
-        field_info = cls.model_fields.get(name, None)  # type: ignore[union-attr]
-        if field_info is None:
-            raise SignalResolvingError(
-                path, f"field '{name}' is not found in Feature '{cls.__name__}'"
-            )
-
-        return self._find_in_feature(field_info.annotation, path, position + 1)  # type: ignore[arg-type]
-
     def clone_without_file_signals(self) -> "SignalSchema":
         schema = copy.deepcopy(self.values)

@@ -297,14 +235,10 @@ class SignalSchema:

         return SignalSchema(self.values | schema_right)

-    def get_file_signals(self) -> list[str]:
-        res = []
-        for name, fr in self.values.items():
-            if Feature.is_feature(fr):
-                signals = fr.get_file_signals([name])  # type: ignore[union-attr]
-                for signal in signals:
-                    res.append(".".join(signal))
-        return res
+    def get_file_signals(self) -> Iterator[str]:
+        for path, type_, has_subtree, _ in self.get_flat_tree():
+            if has_subtree and issubclass(type_, File):
+                yield ".".join(path)

     def get_file_signals_values(self, row: dict[str, Any]) -> dict[str, Any]:
         """
@@ -336,3 +270,67 @@ class SignalSchema:
         }

         return res
+
+    def create_model(self, name: str) -> type[Feature]:
+        fields = {key: (value, None) for key, value in self.values.items()}
+
+        return create_model(
+            name,
+            __base__=(Feature,),  # type: ignore[call-overload]
+            **fields,
+        )
+
+    def _build_tree(self) -> dict[str, Any]:
+        res = {}
+
+        for name, val in self.values.items():
+            subtree = val.build_tree() if Feature.is_feature(val) else None  # type: ignore[union-attr]
+            res[name] = (val, subtree)
+
+        return res
+
+    def get_flat_tree(self) -> Iterator[tuple[list[str], type, bool, int]]:
+        yield from self._get_flat_tree(self.tree, [], 0)
+
+    def _get_flat_tree(
+        self, tree: dict, prefix: list[str], depth: int
+    ) -> Iterator[tuple[list[str], type, bool, int]]:
+        for name, (type_, substree) in tree.items():
+            suffix = name.split(".")
+            new_prefix = prefix + suffix
+            has_subtree = substree is not None
+            yield new_prefix, type_, has_subtree, depth
+            if substree is not None:
+                yield from self._get_flat_tree(substree, new_prefix, depth + 1)
+
+    def print_tree(self, indent: int = 4, start_at: int = 0):
+        for path, type_, _, depth in self.get_flat_tree():
+            total_indent = start_at + depth * indent
+            print(" " * total_indent, f"{path[-1]}:", SignalSchema._type_to_str(type_))
+
+            if get_origin(type_) is list:
+                args = get_args(type_)
+                if len(args) > 0 and Feature.is_feature(args[0]):
+                    sub_schema = SignalSchema({"* list of": args[0]})
+                    sub_schema.print_tree(indent=indent, start_at=total_indent + indent)
+
+    @staticmethod
+    def _type_to_str(type_):
+        if get_origin(type_) == Union:
+            args = get_args(type_)
+            formatted_types = ", ".join(SignalSchema._type_to_str(arg) for arg in args)
+            return f"Union[{formatted_types}]"
+        if get_origin(type_) == Optional:
+            args = get_args(type_)
+            type_str = SignalSchema._type_to_str(args[0])
+            return f"Optional[{type_str}]"
+        if get_origin(type_) is list:
+            args = get_args(type_)
+            type_str = SignalSchema._type_to_str(args[0])
+            return f"list[{type_str}]"
+        if get_origin(type_) is dict:
+            args = get_args(type_)
+            type_str = SignalSchema._type_to_str(args[0])
+            vals = f", {SignalSchema._type_to_str(args[1])}" if len(args) > 1 else ""
+            return f"dict[{type_str}{vals}]"
+        return type_.__name__
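
In 0.2.0 the per-signal string munging in SignalSchema is replaced by a precomputed schema tree: __init__ builds self.tree, and get_flat_tree() walks it, yielding (path, type, has_subtree, depth) tuples that to_udf_spec(), db_signals(), and get_file_signals() simply filter. A self-contained sketch of that walk (hand-built tree and names for illustration, not datachain's API):

# Standalone sketch of the new flat-tree walk (hand-built stand-in tree).
from collections.abc import Iterator

# Maps signal name -> (type, subtree-or-None), the shape _build_tree() produces.
tree = {
    "file": (dict, {"name": (str, None), "size": (int, None)}),
    "score": (float, None),
}

def get_flat_tree(tree, prefix=(), depth=0) -> Iterator[tuple[list, type, bool, int]]:
    for name, (type_, subtree) in tree.items():
        path = [*prefix, *name.split(".")]
        yield path, type_, subtree is not None, depth
        if subtree is not None:
            yield from get_flat_tree(subtree, path, depth + 1)

# Leaves only, joined with a delimiter: the shape db_signals() now returns.
print([".".join(p) for p, _, has_sub, _ in get_flat_tree(tree) if not has_sub])
# -> ['file.name', 'file.size', 'score']
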
datachain/lib/udf.py CHANGED
@@ -21,7 +21,6 @@ class UDFBase:
     is_input_batched = False
     is_output_batched = False
     is_input_grouped = False
-    is_output_single = False

     def __init__(
         self,
@@ -91,9 +90,6 @@ class UDFBase:
         if not self.is_output_batched:
             result_objs = [result_objs]

-        if self.is_output_single:
-            result_objs = [[x] for x in result_objs]
-
         if len(self.output.values) > 1:
             res = []
             for tuple_ in result_objs:
@@ -107,7 +103,7 @@ class UDFBase:
         else:
             # Generator expression is required, otherwise the value will be materialized
             res = (
-                Feature._flatten(obj) if isinstance(obj, Feature) else (obj,)
+                obj._flatten() if isinstance(obj, Feature) else (obj,)
                 for obj in result_objs
             )

@@ -145,9 +141,9 @@ class UDFBase:
         group = rows[0]
         spec_map = {}
         output_map = {}
-        for name, anno in self.params.values.items():
+        for name, (anno, subtree) in self.params.tree.items():
             if inspect.isclass(anno) and issubclass(anno, Feature):
-                length = len(anno._to_udf_spec())
+                length = sum(1 for _ in self.params._get_flat_tree(subtree, [], 0))
             else:
                 length = 1
             spec_map[name] = anno, length
@@ -208,8 +204,3 @@ class Aggregator(UDFBase):
     is_input_batched = True
     is_output_batched = True
     is_input_grouped = True
-
-
-class GroupMapper(UDFBase):
-    is_input_batched = True
-    is_output_batched = True
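
The grouped-input path now sizes each UDF parameter by counting leaves in its schema subtree instead of len(anno._to_udf_spec()). A toy illustration of why those per-parameter widths matter when slicing flat DB rows back into objects (row shape and widths assumed; not the real UDFBase internals):

# Toy example: a Feature parameter spans as many DB columns as its subtree
# has leaves; plain types span exactly one column.
row = ("a.jpg", 1024, 0.9)          # file.name, file.size, score
spec_map = {"file": 2, "score": 1}  # leaf counts, as the new loop computes them

objs, pos = {}, 0
for name, length in spec_map.items():
    objs[name] = row[pos : pos + length]  # slice this parameter's columns
    pos += length
print(objs)  # {'file': ('a.jpg', 1024), 'score': (0.9,)}
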
datachain/lib/udf_signature.py CHANGED
@@ -20,6 +20,8 @@ class UdfSignature:
     params: Sequence[str]
     output_schema: SignalSchema

+    DEFAULT_RETURN_TYPE = str
+
    @classmethod
    def parse(
        cls,
@@ -35,7 +37,7 @@ class UdfSignature:
            raise UdfSignatureError(
                chain,
                f"multiple signals '{keys}' are not supported in processors."
-                f" Chain multiple processors instead.",
+                " Chain multiple processors instead.",
            )
        if len(signal_map) == 1:
            if func is not None:
@@ -69,7 +71,7 @@ class UdfSignature:
            raise UdfSignatureError(
                chain,
                f"outputs are not defined in function '{udf_func.__name__}'"
-                f" hints or 'output'",
+                " hints or 'output'",
            )

        if not signal_name:
@@ -83,7 +85,7 @@ class UdfSignature:
            raise UdfSignatureError(
                chain,
                f"function '{func}' cannot be used in generator/aggregator"
-                f" because it returns a type that is not Iterator/Generator."
+                " because it returns a type that is not Iterator/Generator."
                f" Instead, it returns '{func_outs_sign}'",
            )

@@ -127,7 +129,7 @@ class UdfSignature:
                raise UdfSignatureError(
                    chain,
                    f"output signal '{key}' has type '{type(key)}'"
-                    f" while 'str' is expected",
+                    " while 'str' is expected",
                )
            if not Feature.is_feature_type(value):
                raise UdfSignatureError(
@@ -143,7 +145,7 @@ class UdfSignature:
            raise UdfSignatureError(
                chain,
                f"unknown output type: {output}. List of signals or dict of signals"
-                f" to function are expected.",
+                " to function are expected.",
            )
        return udf_output_map

@@ -182,9 +184,12 @@ class UdfSignature:
        anno = args[0]
        orig = get_origin(anno)

-        if orig and orig == tuple:
+        if orig and orig is tuple:
            output_types = tuple(get_args(anno))  # type: ignore[assignment]
        else:
            output_types = [anno]

+        if not output_types:
+            output_types = [UdfSignature.DEFAULT_RETURN_TYPE]
+
        return input_map, output_types, is_iterator
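
UdfSignature gains DEFAULT_RETURN_TYPE = str, used as a fallback when a UDF declares neither return hints nor an explicit output. A rough standalone sketch of the parsing rule these hunks encode (hypothetical helper; the real parser also unwraps Iterator/Generator hints before this step):

# Hypothetical condensation of the return-hint parsing changed above.
from typing import get_args, get_origin

DEFAULT_RETURN_TYPE = str  # the new class attribute

def parse_output_types(anno):
    orig = get_origin(anno)
    if orig and orig is tuple:                # `is`, as in the fixed comparison
        output_types = list(get_args(anno))
    else:
        output_types = [anno] if anno is not None else []
    if not output_types:                      # the new fallback
        output_types = [DEFAULT_RETURN_TYPE]
    return output_types

print(parse_output_types(tuple[str, int]))  # [<class 'str'>, <class 'int'>]
print(parse_output_types(None))             # [<class 'str'>]
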
datachain/lib/webdataset_laion.py CHANGED
@@ -4,8 +4,8 @@ from typing import Optional
 import numpy as np
 from pydantic import Field

+from datachain.lib.feature import Feature
 from datachain.lib.file import File
-from datachain.lib.parquet import BasicParquet
 from datachain.lib.webdataset import WDSBasic, WDSReadableSubclass


@@ -34,19 +34,9 @@ class WDSLaion(WDSBasic):
     json: Laion  # type: ignore[assignment]


-class LaionParquet(BasicParquet):
-    uid: str = Field(default="")
-    url: str = Field(default="")
-    text: str = Field(default="")
-    original_width: int = Field(default=-1)
-    original_height: int = Field(default=-1)
-    clip_b32_similarity_score: float = Field(default=0.0)
-    clip_l14_similarity_score: float = Field(default=0.0)
-    face_bboxes: Optional[list[list[float]]] = Field(default=None)
-    sha256: str = Field(default="")
-
-
-class LaionMeta(BasicParquet):
+class LaionMeta(Feature):
+    file: File
+    index: Optional[int] = Field(default=None)
     b32_img: list[float] = Field(default=None)
     b32_txt: list[float] = Field(default=None)
     l14_img: list[float] = Field(default=None)
@@ -65,14 +55,7 @@ def process_laion_meta(file: File) -> Iterator[LaionMeta]:

     for index in range(len(b32_img)):
         yield LaionMeta(
-            file=File(
-                name=str(index),
-                source=file.source,
-                parent=f"{file.get_full_name()}",
-                version=file.version,
-                etag=f"{file.etag}_{index}",
-                location={"vtype": LaionMeta.__name__},
-            ),
+            file=file,
             index=index,
             b32_img=b32_img[index],
             b32_txt=b32_txt[index],
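
process_laion_meta previously synthesized a virtual File per embedding row; LaionMeta now keeps a reference to the source file plus a row index. A minimal pydantic stand-in for the new shape (the real class derives from datachain's pydantic-based Feature, and File has more fields):

# Pydantic stand-in for the new LaionMeta shape.
from typing import Optional
from pydantic import BaseModel

class File(BaseModel):      # stand-in for datachain.lib.file.File
    name: str

class LaionMeta(BaseModel):
    file: File                     # the source file, shared by every row
    index: Optional[int] = None    # row position within that file
    b32_img: Optional[list[float]] = None

meta = LaionMeta(file=File(name="embeddings.npz"), index=0, b32_img=[0.1, 0.2])
print(meta.file.name, meta.index)  # embeddings.npz 0
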
datachain/listing.py CHANGED
@@ -192,25 +192,25 @@ class Listing:
         dr = self.dataset_rows
         conds = []
         if names:
-            for name in names:
-                conds.append(Column("name").op("GLOB")(name))
+            f = Column("name").op("GLOB")
+            conds.extend(f(name) for name in names)
         if inames:
-            for iname in inames:
-                conds.append(func.lower(Column("name")).op("GLOB")(iname.lower()))
+            f = func.lower(Column("name")).op("GLOB")
+            conds.extend(f(iname.lower()) for iname in inames)
         if paths:
             node_path = case(
                 (Column("parent") == "", Column("name")),
                 else_=Column("parent") + "/" + Column("name"),
             )
-            for path in paths:
-                conds.append(node_path.op("GLOB")(path))
+            f = node_path.op("GLOB")
+            conds.extend(f(path) for path in paths)
         if ipaths:
             node_path = case(
                 (Column("parent") == "", Column("name")),
                 else_=Column("parent") + "/" + Column("name"),
             )
-            for ipath in ipaths:
-                conds.append(func.lower(node_path).op("GLOB")(ipath.lower()))
+            f = func.lower(node_path).op("GLOB")
+            conds.extend(f(ipath.lower()) for ipath in ipaths)

         if size is not None:
             size_limit = suffix_to_number(size)
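
The refactor hoists the GLOB operator out of each loop: in SQLAlchemy, column.op("GLOB") returns a callable that builds one binary expression per pattern, so conds.extend(...) with a generator replaces the append loops. A runnable sketch with plain SQLAlchemy (column name assumed):

# Plain-SQLAlchemy sketch of the .op("GLOB") pattern used above.
import sqlalchemy as sa

name = sa.column("name")
glob = name.op("GLOB")  # callable: pattern -> `name GLOB pattern` expression
conds = []
conds.extend(glob(p) for p in ["*.jpg", "*.png"])

print(sa.or_(*conds).compile(compile_kwargs={"literal_binds": True}))
# -> name GLOB '*.jpg' OR name GLOB '*.png'
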
datachain/node.py CHANGED
@@ -47,6 +47,7 @@ class DirTypeGroup:
 @attrs.define
 class Node:
     id: int = 0
+    random: int = -1
     vtype: str = ""
     dir_type: Optional[int] = None
     parent: str = ""
@@ -58,7 +59,6 @@ class Node:
     size: int = 0
     owner_name: str = ""
     owner_id: str = ""
-    random: int = -1
     location: Optional[str] = None
     source: StorageURI = StorageURI("")

datachain/progress.py CHANGED
@@ -21,7 +21,7 @@ def env2bool(var, undefined=False):
     var = os.getenv(var, None)
     if var is None:
         return undefined
-    return bool(re.search("1|y|yes|true", var, flags=re.I))
+    return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))


 class Tqdm(tqdm):
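
re.I is just an alias for re.IGNORECASE, so this hunk only spells the flag out. For reference, env2bool's truthiness test matches the pattern anywhere in the value (sketch of the check around this line):

# Sketch of env2bool's truthiness check; re.I and re.IGNORECASE are the same flag.
import re

def env2bool_match(value: str) -> bool:
    return bool(re.search("1|y|yes|true", value, flags=re.IGNORECASE))

print(env2bool_match("TRUE"), env2bool_match("0"))  # True False
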
datachain/query/builtins.py CHANGED
@@ -9,7 +9,7 @@ from .udf import udf

 md5 = partial(hashlib.md5, usedforsecurity=False)

-__all__ = ["index_tar", "checksum"]
+__all__ = ["checksum", "index_tar"]


 def load_tar(raw):