datachain 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/__init__.py +17 -8
- datachain/catalog/catalog.py +5 -5
- datachain/cli.py +0 -2
- datachain/data_storage/schema.py +5 -5
- datachain/data_storage/sqlite.py +1 -1
- datachain/data_storage/warehouse.py +7 -7
- datachain/lib/arrow.py +25 -8
- datachain/lib/clip.py +6 -11
- datachain/lib/convert/__init__.py +0 -0
- datachain/lib/convert/flatten.py +67 -0
- datachain/lib/convert/type_converter.py +96 -0
- datachain/lib/convert/unflatten.py +69 -0
- datachain/lib/convert/values_to_tuples.py +85 -0
- datachain/lib/data_model.py +74 -0
- datachain/lib/dc.py +225 -168
- datachain/lib/file.py +41 -41
- datachain/lib/gpt4_vision.py +1 -9
- datachain/lib/hf_image_to_text.py +9 -17
- datachain/lib/hf_pipeline.py +4 -12
- datachain/lib/image.py +2 -18
- datachain/lib/image_transform.py +0 -1
- datachain/lib/iptc_exif_xmp.py +8 -15
- datachain/lib/meta_formats.py +1 -5
- datachain/lib/model_store.py +77 -0
- datachain/lib/pytorch.py +9 -21
- datachain/lib/signal_schema.py +139 -60
- datachain/lib/text.py +5 -16
- datachain/lib/udf.py +114 -30
- datachain/lib/udf_signature.py +5 -5
- datachain/lib/webdataset.py +3 -3
- datachain/lib/webdataset_laion.py +2 -3
- datachain/node.py +4 -4
- datachain/query/batch.py +1 -1
- datachain/query/dataset.py +51 -178
- datachain/query/dispatch.py +43 -30
- datachain/query/udf.py +46 -26
- datachain/remote/studio.py +1 -9
- datachain/torch/__init__.py +21 -0
- datachain/utils.py +39 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/METADATA +14 -12
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/RECORD +45 -43
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/WHEEL +1 -1
- datachain/image/__init__.py +0 -3
- datachain/lib/cached_stream.py +0 -38
- datachain/lib/claude.py +0 -69
- datachain/lib/feature.py +0 -412
- datachain/lib/feature_registry.py +0 -51
- datachain/lib/feature_utils.py +0 -154
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/LICENSE +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -1,11 +1,16 @@
-from datachain.lib.
-from datachain.lib.
-from datachain.lib.
-
+from datachain.lib.data_model import DataModel, DataType, FileBasic, is_chain_type
+from datachain.lib.dc import C, Column, DataChain, Sys
+from datachain.lib.file import (
+    File,
+    FileError,
+    ImageFile,
+    IndexedFile,
+    TarVFile,
+    TextFile,
+)
 from datachain.lib.udf import Aggregator, Generator, Mapper
 from datachain.lib.utils import AbstractUDF, DataChainError
 from datachain.query.dataset import UDF as BaseUDF  # noqa: N811
-from datachain.query.schema import Column
 from datachain.query.session import Session

 __all__ = [
@@ -16,14 +21,18 @@ __all__ = [
     "Column",
     "DataChain",
     "DataChainError",
-    "
+    "DataModel",
+    "DataType",
     "File",
+    "FileBasic",
     "FileError",
-    "FileFeature",
     "Generator",
+    "ImageFile",
     "IndexedFile",
     "Mapper",
     "Session",
+    "Sys",
     "TarVFile",
-    "
+    "TextFile",
+    "is_chain_type",
 ]
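For orientation, a minimal sketch of how the reshaped top-level API reads after this change. It assumes DataModel behaves as a pydantic-style base class for user-defined signals (consistent with the new datachain/lib/data_model.py); the Annotation model below is made up for the example.

# Illustrative only: imports mirror the new __all__; Annotation is a made-up model.
from datachain import DataModel, ImageFile, is_chain_type


class Annotation(DataModel):  # assumption: DataModel is a pydantic-style base class
    label: str
    score: float


# is_chain_type() reports whether a type can be carried through a chain
print(is_chain_type(ImageFile), is_chain_type(Annotation))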
datachain/catalog/catalog.py
CHANGED
@@ -256,7 +256,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.fix_columns(df)

         # id will be autogenerated in DB
-        df = df.drop("id", axis=1)
+        df = df.drop("sys__id", axis=1)

         inserted = warehouse.insert_dataset_rows(
             df, dataset, self.dataset_version
@@ -1041,7 +1041,7 @@ class Catalog:
         If version is None, then next unused version is created.
         If version is given, then it must be an unused version number.
         """
-        assert [c.name for c in columns if c.name != "id"], f"got {columns=}"
+        assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
                 "Cannot create dataset that starts with source prefix, e.g s3://"
@@ -1103,7 +1103,7 @@ class Catalog:
         Creates dataset version if it doesn't exist.
         If create_rows is False, dataset rows table will not be created
         """
-        assert [c.name for c in columns if c.name != "id"], f"got {columns=}"
+        assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         schema = {
             c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
         }
@@ -1433,7 +1433,7 @@ class Catalog:
         if offset:
             q = q.offset(offset)

-        q = q.order_by("id")
+        q = q.order_by("sys__id")

         return q.to_records()

@@ -1786,7 +1786,7 @@ class Catalog:
         schema = DatasetRecord.parse_schema(remote_dataset_version.schema)

         columns = tuple(
-            sa.Column(name, typ) for name, typ in schema.items() if name != "id"
+            sa.Column(name, typ) for name, typ in schema.items() if name != "sys__id"
         )
         # creating new dataset (version) locally
         dataset = self.create_dataset(
datachain/cli.py
CHANGED
datachain/data_storage/schema.py
CHANGED
@@ -72,7 +72,7 @@ class DirExpansion:
     @staticmethod
     def base_select(q):
         return sa.select(
-            q.c.id,
+            q.c.sys__id,
             q.c.vtype,
             (q.c.dir_type == DirType.DIR).label("is_dir"),
             q.c.source,
@@ -86,7 +86,7 @@ class DirExpansion:
     def apply_group_by(q):
         return (
             sa.select(
-                f.min(q.c.id).label("id"),
+                f.min(q.c.sys__id).label("sys__id"),
                 q.c.vtype,
                 q.c.is_dir,
                 q.c.source,
@@ -111,7 +111,7 @@ class DirExpansion:
         parent_name = path.name(q.c.parent)
         q = q.union_all(
             sa.select(
-                sa.literal(-1).label("id"),
+                sa.literal(-1).label("sys__id"),
                 sa.literal("").label("vtype"),
                 true().label("is_dir"),
                 q.c.source,
@@ -233,9 +233,9 @@ class DataTable:
     @staticmethod
     def sys_columns():
         return [
-            sa.Column("id", Int, primary_key=True),
+            sa.Column("sys__id", Int, primary_key=True),
             sa.Column(
-                "random", UInt64, nullable=False, server_default=f.abs(f.random())
+                "sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
             ),
         ]

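As a side note, the sys__ prefix above is the new naming convention for system columns, and the catalog/warehouse changes below simply follow it. A minimal SQLAlchemy sketch of the resulting table shape, with a made-up user column and generic SQLAlchemy types standing in for datachain's Int/UInt64:

# Illustrative only: generic SQLAlchemy types replace datachain's Int/UInt64.
import sqlalchemy as sa

rows = sa.Table(
    "example_rows",
    sa.MetaData(),
    sa.Column("sys__id", sa.Integer, primary_key=True),
    sa.Column("sys__rand", sa.BigInteger, nullable=False),
    sa.Column("path", sa.Text),  # made-up user column
)

# Downstream code (catalog, warehouse) now orders and counts on the prefixed name:
print(sa.select(rows.c.path).order_by(rows.c.sys__id))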
datachain/data_storage/sqlite.py
CHANGED
@@ -631,7 +631,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         dst_empty = True

         dst_dr = self.dataset_rows(dst, dst_version).table
-        merge_fields = [c.name for c in src_dr.c if c.name != "id"]
+        merge_fields = [c.name for c in src_dr.c if c.name != "sys__id"]
         select_src = select(*(getattr(src_dr.c, f) for f in merge_fields))

         if dst_empty:
datachain/data_storage/warehouse.py
CHANGED

@@ -195,7 +195,7 @@ class AbstractWarehouse(ABC, Serializable):
         cols_names = [c.name for c in cols]

         if not order_by:
-            ordering = [cols.id]
+            ordering = [cols.sys__id]
         else:
             ordering = order_by  # type: ignore[assignment]

@@ -372,7 +372,7 @@ class AbstractWarehouse(ABC, Serializable):
         """Returns total number of rows in a dataset"""
         dr = self.dataset_rows(dataset, version)
         table = dr.get_table()
-        query = select(sa.func.count(table.c.id))
+        query = select(sa.func.count(table.c.sys__id))
         (res,) = self.db.execute(query)
         return res[0]

@@ -388,7 +388,7 @@ class AbstractWarehouse(ABC, Serializable):
         dr = self.dataset_rows(dataset, version)
         table = dr.get_table()
         expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
-            sa.func.count(table.c.id),
+            sa.func.count(table.c.sys__id),
         )
         if "size" in table.columns:
             expressions = (*expressions, sa.func.sum(table.c.size))
@@ -607,7 +607,7 @@ class AbstractWarehouse(ABC, Serializable):
             return func.coalesce(column, default).label(column.name)

         return sa.select(
-            de.c.id,
+            de.c.sys__id,
             with_default(dr.c.vtype),
             case((de.c.is_dir == true(), DirType.DIR), else_=dr.c.dir_type).label(
                 "dir_type"
@@ -621,10 +621,10 @@ class AbstractWarehouse(ABC, Serializable):
             with_default(dr.c.size),
             with_default(dr.c.owner_name),
             with_default(dr.c.owner_id),
-            with_default(dr.c.random),
+            with_default(dr.c.sys__rand),
             dr.c.location,
             de.c.source,
-        ).select_from(de.outerjoin(dr.table, de.c.id == dr.c.id))
+        ).select_from(de.outerjoin(dr.table, de.c.sys__id == dr.c.sys__id))

     def get_node_by_path(self, dataset_rows: "DataTable", path: str) -> Node:
         """Gets node that corresponds to some path"""
@@ -878,7 +878,7 @@ class AbstractWarehouse(ABC, Serializable):
         tbl = sa.Table(
             name,
             sa.MetaData(),
-            sa.Column("id", Int, primary_key=True),
+            sa.Column("sys__id", Int, primary_key=True),
             *columns,
         )
         self.db.create_table(tbl, if_not_exists=True)
datachain/lib/arrow.py
CHANGED
@@ -1,13 +1,15 @@
 import re
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional

+import pyarrow as pa
 from pyarrow.dataset import dataset

 from datachain.lib.file import File, IndexedFile
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
-
+    from datachain.lib.dc import DataChain


 class ArrowGenerator(Generator):
@@ -35,12 +37,29 @@ class ArrowGenerator(Generator):
            index += 1


-def schema_to_output(schema: "pa.Schema"):
+def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
+    schemas = []
+    for file in chain.iterate_one("file"):
+        ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+        schemas.append(ds.schema)
+    return pa.unify_schemas(schemas)
+
+
+def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None):
     """Generate UDF output schema from pyarrow schema."""
+    if col_names and (len(schema) != len(col_names)):
+        raise ValueError(
+            "Error generating output from Arrow schema - "
+            f"Schema has {len(schema)} columns but got {len(col_names)} column names."
+        )
     default_column = 0
-    output = {
-    for field in schema:
-
+    output = {}
+    for i, field in enumerate(schema):
+        if col_names:
+            column = col_names[i]
+        else:
+            column = field.name
+        column = column.lower()
         column = re.sub("[^0-9a-z_]+", "", column)
         if not column:
             column = f"c{default_column}"
@@ -50,12 +69,10 @@ def schema_to_output(schema: "pa.Schema"):
     return output


-def _arrow_type_mapper(col_type:
+def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime

-    import pyarrow as pa
-
     if pa.types.is_timestamp(col_type):
         return datetime
     if pa.types.is_binary(col_type):
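The new infer_schema/schema_to_output pair unifies per-file Arrow schemas and sanitizes column names (lowercase, strip characters outside [0-9a-z_], fall back to generated c<N> names). A standalone sketch of that name handling on a made-up pair of schemas:

# Illustrative only: reproduces the unify + name-sanitization idea outside datachain.
import re

import pyarrow as pa

schema = pa.unify_schemas([
    pa.schema([("User Name", pa.string())]),
    pa.schema([("User Name", pa.string()), ("Score %", pa.float32())]),
])

default_column = 0
for field in schema:
    column = re.sub("[^0-9a-z_]+", "", field.name.lower())
    if not column:
        column = f"c{default_column}"
        default_column += 1
    print(column)  # -> "username", then "score"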
datachain/lib/clip.py
CHANGED
@@ -1,19 +1,14 @@
 import inspect
-from typing import Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+
+import torch
+from transformers.modeling_utils import PreTrainedModel

 from datachain.lib.image import convert_images
 from datachain.lib.text import convert_text

-try:
-    import torch
+if TYPE_CHECKING:
     from PIL import Image
-    from transformers.modeling_utils import PreTrainedModel
-except ImportError as exc:
-    raise ImportError(
-        "Missing dependencies for computer vision:\n"
-        "To install run:\n\n"
-        "  pip install 'datachain[cv]'\n"
-    ) from exc


 def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
@@ -37,7 +32,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:


 def similarity_scores(
-    images: Union[None, Image.Image, list[Image.Image]],
+    images: Union[None, "Image.Image", list["Image.Image"]],
     text: Union[None, str, list[str]],
     model: Any,
     preprocess: Callable,
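The change drops the try/except import guard in favor of plain module-level imports, keeping PIL behind TYPE_CHECKING and referencing it via quoted annotations. The pattern in isolation (the function below is made up):

# Illustrative only: heavy deps import eagerly; annotation-only deps stay lazy.
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    from PIL import Image  # only needed by type checkers, never at runtime


def count_images(images: Union[None, "Image.Image", list["Image.Image"]]) -> int:
    if images is None:
        return 0
    return len(images) if isinstance(images, list) else 1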
datachain/lib/convert/__init__.py
ADDED
File without changes
datachain/lib/convert/flatten.py
ADDED
@@ -0,0 +1,67 @@
+from datetime import datetime
+
+from pydantic import BaseModel
+
+from datachain.lib.model_store import ModelStore
+from datachain.sql.types import (
+    JSON,
+    Array,
+    Binary,
+    Boolean,
+    DateTime,
+    Float,
+    Int,
+    Int32,
+    Int64,
+    NullType,
+    String,
+)
+
+DATACHAIN_TO_TYPE = {
+    Int: int,
+    Int32: int,
+    Int64: int,
+    String: str,
+    Float: float,
+    Boolean: bool,
+    DateTime: datetime,
+    Binary: bytes,
+    Array(NullType): list,
+    JSON: dict,
+}
+
+
+def flatten(obj: BaseModel):
+    return tuple(_flatten_fields_values(obj.model_fields, obj))
+
+
+def flatten_list(obj_list):
+    return tuple(
+        val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj)
+    )
+
+
+def _flatten_fields_values(fields, obj: BaseModel):
+    for name, f_info in fields.items():
+        anno = f_info.annotation
+        # Optimization: Access attributes directly to skip the model_dump() call.
+        value = getattr(obj, name)
+
+        if isinstance(value, list):
+            yield [
+                val.model_dump() if ModelStore.is_pydantic(type(val)) else val
+                for val in value
+            ]
+        elif isinstance(value, dict):
+            yield {
+                key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
+                for key, val in value.items()
+            }
+        elif ModelStore.is_pydantic(anno):
+            yield from _flatten_fields_values(anno.model_fields, value)
+        else:
+            yield value
+
+
+def _flatten(obj):
+    return tuple(_flatten_fields_values(obj.model_fields, obj))
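A usage sketch for the new flatten helpers (models below are made up): nested pydantic models are walked depth-first into a flat tuple of leaf values, while list and dict fields are dumped in place.

# Illustrative only; expected output is annotated, not asserted.
from pydantic import BaseModel

from datachain.lib.convert.flatten import flatten


class Location(BaseModel):
    lat: float
    lon: float


class Place(BaseModel):
    name: str
    loc: Location
    tags: list[str]


print(flatten(Place(name="pier", loc=Location(lat=1.0, lon=2.0), tags=["sea"])))
# expected: ('pier', 1.0, 2.0, ['sea'])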
datachain/lib/convert/type_converter.py
ADDED

@@ -0,0 +1,96 @@
+import inspect
+from datetime import datetime
+from enum import Enum
+from typing import Annotated, Literal, Union, get_args, get_origin
+
+from pydantic import BaseModel
+from typing_extensions import Literal as LiteralEx
+
+from datachain.lib.model_store import ModelStore
+from datachain.sql.types import (
+    JSON,
+    Array,
+    Binary,
+    Boolean,
+    DateTime,
+    Float,
+    Int64,
+    SQLType,
+    String,
+)
+
+TYPE_TO_DATACHAIN = {
+    int: Int64,
+    str: String,
+    Literal: String,
+    LiteralEx: String,
+    Enum: String,
+    float: Float,
+    bool: Boolean,
+    datetime: DateTime,  # Note, list of datetime is not supported yet
+    bytes: Binary,  # Note, list of bytes is not supported yet
+    list: Array,
+    dict: JSON,
+}
+
+
+def convert_to_db_type(typ):  # noqa: PLR0911
+    if inspect.isclass(typ):
+        if issubclass(typ, SQLType):
+            return typ
+        if issubclass(typ, Enum):
+            return str
+
+    res = TYPE_TO_DATACHAIN.get(typ)
+    if res:
+        return res
+
+    orig = get_origin(typ)
+
+    if orig in (Literal, LiteralEx):
+        return String
+
+    args = get_args(typ)
+    if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
+        if args is None or len(args) != 1:
+            raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
+
+        args0 = args[0]
+        if ModelStore.is_pydantic(args0):
+            return Array(JSON())
+
+        next_type = convert_to_db_type(args0)
+        return Array(next_type)
+
+    if orig is Annotated:
+        # Ignoring annotations
+        return convert_to_db_type(args[0])
+
+    if inspect.isclass(orig) and issubclass(dict, orig):
+        return JSON
+
+    if orig == Union:
+        if len(args) == 2 and (type(None) in args):
+            return convert_to_db_type(args[0])
+
+        if _is_json_inside_union(orig, args):
+            return JSON
+
+    raise TypeError(f"Cannot recognize type {typ}")
+
+
+def _is_json_inside_union(orig, args) -> bool:
+    if orig == Union and len(args) >= 2:
+        # List in JSON: Union[dict, list[dict]]
+        args_no_nones = [arg for arg in args if arg != type(None)]
+        if len(args_no_nones) == 2:
+            args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
+            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
+                arg = get_args(args_no_dicts[0])
+                if len(arg) == 1 and arg[0] is dict:
+                    return True
+
+        # List of objects: Union[MyClass, OtherClass]
+        if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
+            return True
+    return False
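A few example mappings for convert_to_db_type, derived from the table and branches above (illustrative, not exhaustive):

# Illustrative only.
from typing import Optional, Union

from datachain.lib.convert.type_converter import convert_to_db_type
from datachain.sql.types import JSON, Float, String

print(convert_to_db_type(str) is String)                    # plain types via TYPE_TO_DATACHAIN
print(convert_to_db_type(Optional[float]) is Float)         # Optional[X] unwraps to X
print(convert_to_db_type(Union[dict, list[dict]]) is JSON)  # JSON-in-union case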
datachain/lib/convert/unflatten.py
ADDED

@@ -0,0 +1,69 @@
+import copy
+import inspect
+import re
+from collections.abc import Sequence
+from typing import Any, get_origin
+
+from pydantic import BaseModel
+
+from datachain.query.schema import DEFAULT_DELIMITER
+
+
+def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict:
+    return unflatten_to_json_pos(model, row, pos)[0]
+
+
+def unflatten_to_json_pos(
+    model: type[BaseModel], row: Sequence[Any], pos=0
+) -> tuple[dict, int]:
+    res = {}
+    for name, f_info in model.model_fields.items():
+        anno = f_info.annotation
+        origin = get_origin(anno)
+        if (
+            origin not in (list, dict)
+            and inspect.isclass(anno)
+            and issubclass(anno, BaseModel)
+        ):
+            res[name], pos = unflatten_to_json_pos(anno, row, pos)
+        else:
+            res[name] = row[pos]
+            pos += 1
+    return res, pos
+
+
+def _normalize(name: str) -> str:
+    if DEFAULT_DELIMITER in name:
+        raise RuntimeError(
+            f"variable '{name}' cannot be used "
+            f"because it contains {DEFAULT_DELIMITER}"
+        )
+    return _to_snake_case(name)
+
+
+def _to_snake_case(name: str) -> str:
+    """Convert a CamelCase name to snake_case."""
+    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
+
+
+def _unflatten_with_path(model: type[BaseModel], dump, name_path: list[str]):
+    res = {}
+    for name, f_info in model.model_fields.items():
+        anno = f_info.annotation
+        name_norm = _normalize(name)
+        lst = copy.copy(name_path)
+
+        if inspect.isclass(anno) and issubclass(anno, BaseModel):
+            lst.append(name_norm)
+            val = _unflatten_with_path(anno, dump, lst)
+            res[name] = val
+        else:
+            lst.append(name_norm)
+            curr_path = DEFAULT_DELIMITER.join(lst)
+            res[name] = dump[curr_path]
+    return model(**res)
+
+
+def unflatten(model: type[BaseModel], dump):
+    return _unflatten_with_path(model, dump, [])
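A round-trip sketch for the unflatten helpers (models made up; this assumes DEFAULT_DELIMITER is the double underscore seen in the sys__id-style names elsewhere in this release):

# Illustrative only.
from pydantic import BaseModel

from datachain.lib.convert.unflatten import unflatten, unflatten_to_json


class Location(BaseModel):
    lat: float
    lon: float


class Place(BaseModel):
    name: str
    loc: Location


print(unflatten(Place, {"name": "pier", "loc__lat": 1.0, "loc__lon": 2.0}))
# expected: Place(name='pier', loc=Location(lat=1.0, lon=2.0))

print(unflatten_to_json(Place, ["pier", 1.0, 2.0]))
# expected: {'name': 'pier', 'loc': {'lat': 1.0, 'lon': 2.0}}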
datachain/lib/convert/values_to_tuples.py
ADDED

@@ -0,0 +1,85 @@
+from collections.abc import Sequence
+from typing import Any, Union
+
+from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
+from datachain.lib.utils import DataChainParamsError
+
+
+class ValuesToTupleError(DataChainParamsError):
+    def __init__(self, ds_name, msg):
+        if ds_name:
+            ds_name = f"' {ds_name}'"
+        super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")
+
+
+def values_to_tuples(
+    ds_name: str = "",
+    output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
+    **fr_map,
+) -> tuple[Any, Any, Any]:
+    types_map = {}
+    length = -1
+    for k, v in fr_map.items():
+        if not isinstance(v, Sequence) or isinstance(v, str):
+            raise ValuesToTupleError(ds_name, f"features '{k}' is not a sequence")
+        len_ = len(v)
+
+        if len_ == 0:
+            raise ValuesToTupleError(ds_name, f"feature '{k}' is empty list")
+
+        if length < 0:
+            length = len_
+        elif length != len_:
+            raise ValuesToTupleError(
+                ds_name,
+                f"feature '{k}' should have length {length} while {len_} is given",
+            )
+        typ = type(v[0])
+        if not is_chain_type(typ):
+            raise ValuesToTupleError(
+                ds_name,
+                f"feature '{k}' has unsupported type '{typ.__name__}'."
+                f" Please use Feature types: {DataTypeNames}",
+            )
+        types_map[k] = typ
+    if output:
+        if not isinstance(output, Sequence) and not isinstance(output, str):
+            if len(fr_map) != 1:
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"only one output type was specified, {len(fr_map)} expected",
+                )
+            if not isinstance(output, type):
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"output must specify a type while '{output}' was given",
+                )
+
+            key: str = next(iter(fr_map.keys()))
+            output = {key: output}  # type: ignore[dict-item]
+
+        if len(output) != len(fr_map):
+            raise ValuesToTupleError(
+                ds_name,
+                f"number of outputs '{len(output)}' should match"
+                f" number of features '{len(fr_map)}'",
+            )
+        if isinstance(output, dict):
+            raise ValuesToTupleError(
+                ds_name,
+                "output type must be dict[str, FeatureType] while "
+                f"'{type(output).__name__}' is given",
+            )
+    else:
+        output = types_map  # type: ignore[assignment]
+
+    output_types: list[type] = list(output.values())  # type: ignore[union-attr,call-arg,arg-type]
+    if len(output) > 1:  # type: ignore[arg-type]
+        tuple_type = tuple(output_types)
+        res_type = tuple[tuple_type]  # type: ignore[valid-type]
+        res_values = list(zip(*fr_map.values()))
+    else:
+        res_type = output_types[0]  # type: ignore[misc]
+        res_values = next(iter(fr_map.values()))
+
+    return res_type, output, res_values
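Finally, a sketch of what values_to_tuples returns for parallel value lists (assuming plain str/float pass is_chain_type); the dataset name and values are made up:

# Illustrative only.
from datachain.lib.convert.values_to_tuples import values_to_tuples

item_type, output, rows = values_to_tuples(
    "my-ds",
    caption=["a cat", "a dog"],
    score=[0.9, 0.7],
)
print(item_type)  # expected: tuple[str, float]
print(output)     # expected: {'caption': <class 'str'>, 'score': <class 'float'>}
print(rows)       # expected: [('a cat', 0.9), ('a dog', 0.7)]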