datachain 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +7 -1
- datachain/cli.py +11 -0
- datachain/data_storage/metastore.py +0 -4
- datachain/data_storage/schema.py +7 -3
- datachain/data_storage/sqlite.py +1 -4
- datachain/data_storage/warehouse.py +1 -24
- datachain/lib/convert/flatten.py +4 -4
- datachain/lib/convert/values_to_tuples.py +4 -1
- datachain/lib/dc.py +100 -5
- datachain/lib/file.py +23 -22
- datachain/lib/meta_formats.py +6 -5
- datachain/query/dataset.py +29 -23
- datachain/sql/sqlite/base.py +3 -3
- datachain/sql/sqlite/types.py +5 -13
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/METADATA +42 -44
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/RECORD +20 -20
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/LICENSE +0 -0
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/WHEEL +0 -0
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,4 +1,5 @@
 import ast
+import glob
 import io
 import json
 import logging
@@ -709,7 +710,12 @@ class Catalog:

        client_config = client_config or self.client_config
        client, path = self.parse_url(source, **client_config)
-
+       stem = os.path.basename(os.path.normpath(path))
+       prefix = (
+           posixpath.dirname(path)
+           if glob.has_magic(stem) or client.fs.isfile(source)
+           else path
+       )
        storage_dataset_name = Storage.dataset_name(
            client.uri, posixpath.join(prefix, "")
        )
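The new listing prefix keeps whole directories intact and only strips the last path component for glob patterns or single files. A minimal standalone sketch of that check, assuming a plain boolean stands in for the `client.fs.isfile(source)` call:

```py
import glob
import os
import posixpath

def listing_prefix(path: str, is_file: bool) -> str:
    # Strip the last component only for glob patterns ("*.jpg") or single files;
    # a plain directory path is used as the prefix unchanged.
    stem = os.path.basename(os.path.normpath(path))
    return posixpath.dirname(path) if glob.has_magic(stem) or is_file else path

print(listing_prefix("bucket/images/*.jpg", is_file=False))  # bucket/images
print(listing_prefix("bucket/images", is_file=False))        # bucket/images
```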
datachain/cli.py
CHANGED
@@ -491,6 +491,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
        type=int,
        help="Dataset version",
    )
+   show_parser.add_argument("--schema", action="store_true", help="Show schema")
    add_show_args(show_parser)

    query_parser = subp.add_parser(
@@ -816,10 +817,15 @@ def show(
    offset: int = 0,
    columns: Sequence[str] = (),
    no_collapse: bool = False,
+   schema: bool = False,
) -> None:
+   from datachain.lib.dc import DataChain
    from datachain.query import DatasetQuery
    from datachain.utils import show_records

+   dataset = catalog.get_dataset(name)
+   dataset_version = dataset.get_version(version or dataset.latest_version)
+
    query = (
        DatasetQuery(name=name, version=version, catalog=catalog)
        .select(*columns)
@@ -828,6 +834,10 @@ def show(
    )
    records = query.to_db_records()
    show_records(records, collapse_columns=not no_collapse)
+   if schema and dataset_version.feature_schema:
+       print("\nSchema:")
+       dc = DataChain(name=name, version=version, catalog=catalog)
+       dc.print_schema()


def query(
@@ -1013,6 +1023,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
            offset=args.offset,
            columns=args.columns,
            no_collapse=args.no_collapse,
+           schema=args.schema,
        )
    elif args.command == "rm-dataset":
        rm_dataset(catalog, args.name, version=args.version, force=args.force)
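The new `--schema` flag prints the dataset's feature schema after the records. The same information is reachable from Python; a hedged sketch assuming a dataset named "my_dataset" was saved earlier (`from_dataset` and `print_schema` are the calls used elsewhere in this release):

```py
from datachain.lib.dc import DataChain

# Roughly what `datachain show my_dataset --schema` prints at the end.
chain = DataChain.from_dataset("my_dataset")
chain.print_schema()
```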
datachain/data_storage/metastore.py
CHANGED
@@ -421,10 +421,6 @@ class AbstractMetastore(ABC, Serializable):
    ) -> None:
        """Set the status of the given job and dataset."""

-   @abstractmethod
-   def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-       """Returns the possibly stale jobs."""
-

class AbstractDBMetastore(AbstractMetastore):
    """
datachain/data_storage/schema.py
CHANGED
@@ -19,8 +19,12 @@ from datachain.sql.types import Int, SQLType, UInt64
 if TYPE_CHECKING:
    from sqlalchemy import Engine
    from sqlalchemy.engine.interfaces import Dialect
-   from sqlalchemy.sql.base import
-
+   from sqlalchemy.sql.base import (
+       ColumnCollection,
+       Executable,
+       ReadOnlyColumnCollection,
+   )
+   from sqlalchemy.sql.elements import ColumnElement


def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -43,7 +47,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:


def convert_rows_custom_column_types(
-   columns: "
+   columns: "ColumnCollection[str, ColumnElement[Any]]",
    rows: Iterator[tuple[Any, ...]],
    dialect: "Dialect",
):
datachain/data_storage/sqlite.py
CHANGED
@@ -496,9 +496,6 @@ class SQLiteMetastore(AbstractDBMetastore):
    def _jobs_insert(self) -> "Insert":
        return sqlite.insert(self._jobs)

-   def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-       raise NotImplementedError("get_possibly_stale_jobs not implemented for SQLite")
-

class SQLiteWarehouse(AbstractWarehouse):
    """
@@ -594,7 +591,7 @@ class SQLiteWarehouse(AbstractWarehouse):
    ):
        rows = self.db.execute(select_query, **kwargs)
        yield from convert_rows_custom_column_types(
-           select_query.
+           select_query.selected_columns, rows, sqlite_dialect
        )

    def get_dataset_sources(
datachain/data_storage/warehouse.py
CHANGED
@@ -494,7 +494,7 @@ class AbstractWarehouse(ABC, Serializable):
        This gets nodes based on the provided query, and should be used sparingly,
        as it will be slow on any OLAP database systems.
        """
-       columns = [c.name for c in query.
+       columns = [c.name for c in query.selected_columns]
        for row in self.db.execute(query):
            d = dict(zip(columns, row))
            yield Node(**d)
@@ -912,29 +912,6 @@ class AbstractWarehouse(ABC, Serializable):
        for name in names:
            self.db.drop_table(Table(name, self.db.metadata), if_exists=True)

-   def subtract_query(
-       self,
-       source_query: sa.sql.selectable.Select,
-       target_query: sa.sql.selectable.Select,
-   ) -> sa.sql.selectable.Select:
-       sq = source_query.alias("source_query")
-       tq = target_query.alias("target_query")
-
-       source_target_join = sa.join(
-           sq,
-           tq,
-           (sq.c.source == tq.c.source)
-           & (sq.c.parent == tq.c.parent)
-           & (sq.c.name == tq.c.name),
-           isouter=True,
-       )
-
-       return (
-           select(*sq.c)
-           .select_from(source_target_join)
-           .where((tq.c.name == None) | (tq.c.name == ""))  # noqa: E711
-       )
-
    def changed_query(
        self,
        source_query: sa.sql.selectable.Select,
datachain/lib/convert/flatten.py
CHANGED
@@ -48,10 +48,10 @@ def _flatten_fields_values(fields, obj: BaseModel):
        value = getattr(obj, name)

        if isinstance(value, list):
-
-           val.model_dump()
-
-
+           if value and ModelStore.is_pydantic(type(value[0])):
+               yield [val.model_dump() for val in value]
+           else:
+               yield value
        elif isinstance(value, dict):
            yield {
                key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
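The new list branch dumps Pydantic elements individually instead of yielding the model objects. Roughly equivalent standalone behaviour, sketched with hypothetical models rather than DataChain internals:

```py
from pydantic import BaseModel

class Point(BaseModel):
    x: int
    y: int

class Route(BaseModel):
    name: str
    points: list[Point]

route = Route(name="r1", points=[Point(x=1, y=2), Point(x=3, y=4)])
value = route.points

# Mirror the new handling: dump Pydantic elements, pass other lists through as-is.
flattened = [p.model_dump() for p in value] if value and isinstance(value[0], BaseModel) else value
print(flattened)  # [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}]
```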
datachain/lib/convert/values_to_tuples.py
CHANGED
@@ -71,7 +71,10 @@ def values_to_tuples(  # noqa: C901, PLR0912
                f"signal '{k}' has unsupported type '{typ.__name__}'."
                f" Please use DataModel types: {DataTypeNames}",
            )
-
+       if typ is list:
+           types_map[k] = list[type(v[0][0])]  # type: ignore[misc]
+       else:
+           types_map[k] = typ

        if length < 0:
            length = len_
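With this change a plain `list` signal gets a parametrized type inferred from its first element. A small illustration of that inference with made-up values:

```py
# v holds the rows for one signal; v[0] is the first row, v[0][0] its first element.
v = [["a", "b"], ["c"]]
typ = type(v[0])                 # list
inferred = list[type(v[0][0])]   # list[str], what ends up in types_map for plain lists
print(typ, inferred)
```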
datachain/lib/dc.py
CHANGED
@@ -342,7 +342,7 @@ class DataChain(DatasetQuery):
        spec: Optional[DataType] = None,
        schema_from: Optional[str] = "auto",
        jmespath: Optional[str] = None,
-       object_name: str = "",
+       object_name: Optional[str] = "",
        model_name: Optional[str] = None,
        show_schema: Optional[bool] = False,
        meta_type: Optional[str] = "json",
@@ -364,12 +364,12 @@ class DataChain(DatasetQuery):
            nrows : optional row limit for jsonl and JSON arrays

        Example:
-           infer JSON schema from data, reduce using JMESPATH
+           infer JSON schema from data, reduce using JMESPATH
            ```py
            chain = DataChain.from_json("gs://json", jmespath="key1.key2")
            ```

-           infer JSON schema from a particular path
+           infer JSON schema from a particular path
            ```py
            chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
            ```
@@ -384,7 +384,7 @@ class DataChain(DatasetQuery):
        if (not object_name) and jmespath:
            object_name = jmespath_to_name(jmespath)
        if not object_name:
-           object_name =
+           object_name = meta_type
        chain = DataChain.from_storage(path=path, type=type, **kwargs)
        signal_dict = {
            object_name: read_meta(
@@ -397,7 +397,67 @@ class DataChain(DatasetQuery):
                nrows=nrows,
            )
        }
-       return chain.gen(**signal_dict)  # type: ignore[arg-type]
+       return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+
+   @classmethod
+   def from_jsonl(
+       cls,
+       path,
+       type: Literal["binary", "text", "image"] = "text",
+       spec: Optional[DataType] = None,
+       schema_from: Optional[str] = "auto",
+       jmespath: Optional[str] = None,
+       object_name: Optional[str] = "",
+       model_name: Optional[str] = None,
+       show_schema: Optional[bool] = False,
+       meta_type: Optional[str] = "jsonl",
+       nrows=None,
+       **kwargs,
+   ) -> "DataChain":
+       """Get data from JSON lines. It returns the chain itself.
+
+       Parameters:
+           path : storage URI with directory. URI must start with storage prefix such
+               as `s3://`, `gs://`, `az://` or "file:///"
+           type : read file as "binary", "text", or "image" data. Default is "binary".
+           spec : optional Data Model
+           schema_from : path to sample to infer spec (if schema not provided)
+           object_name : generated object column name
+           model_name : optional generated model name
+           show_schema : print auto-generated schema
+           jmespath : optional JMESPATH expression to reduce JSON
+           nrows : optional row limit for jsonl and JSON arrays
+
+       Example:
+           infer JSONl schema from data, limit parsing to 1 row
+           ```py
+           chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
+           ```
+       """
+       if schema_from == "auto":
+           schema_from = path
+
+       def jmespath_to_name(s: str):
+           name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+           return s[:name_end]
+
+       if (not object_name) and jmespath:
+           object_name = jmespath_to_name(jmespath)
+       if not object_name:
+           object_name = meta_type
+       chain = DataChain.from_storage(path=path, type=type, **kwargs)
+       signal_dict = {
+           object_name: read_meta(
+               schema_from=schema_from,
+               meta_type=meta_type,
+               spec=spec,
+               model_name=model_name,
+               show_schema=show_schema,
+               jmespath=jmespath,
+               nrows=nrows,
+           )
+       }
+       return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]

    @classmethod
    def datasets(
@@ -951,6 +1011,41 @@ class DataChain(DatasetQuery):

        return ds

+   def subtract(  # type: ignore[override]
+       self,
+       other: "DataChain",
+       on: Optional[Union[str, Sequence[str]]] = None,
+   ) -> "Self":
+       """Remove rows that appear in another chain.
+
+       Parameters:
+           other: chain whose rows will be removed from `self`
+           on: columns to consider for determining row equality. If unspecified,
+               defaults to all common columns between `self` and `other`.
+       """
+       if isinstance(on, str):
+           on = [on]
+       if on is None:
+           other_columns = set(other._effective_signals_schema.db_signals())
+           signals = [
+               c
+               for c in self._effective_signals_schema.db_signals()
+               if c in other_columns
+           ]
+           if not signals:
+               raise DataChainParamsError("subtract(): no common columns")
+       elif not isinstance(on, Sequence):
+           raise TypeError(
+               f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
+           )
+       elif not on:
+           raise DataChainParamsError(
+               "'on' cannot be empty",
+           )
+       else:
+           signals = self.signals_schema.resolve(*on).db_signals()
+       return super()._subtract(other, signals)
+
    @classmethod
    def from_values(
        cls,
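A hedged sketch tying the two new `DataChain` methods together; the bucket path follows the docstring example above and the dataset name is a placeholder:

```py
from datachain.lib.dc import DataChain

# Parse a directory of JSON-lines files, inferring the schema from one row.
chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)

# Drop rows already present in a previously saved chain; with on=None the
# comparison defaults to all columns the two chains have in common.
seen = DataChain.from_dataset("already_processed")
fresh = chain.subtract(seen)
```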
datachain/lib/file.py
CHANGED
@@ -12,7 +12,6 @@ from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname

 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from fsspec.implementations.local import LocalFileSystem
 from PIL import Image
 from pydantic import Field, field_validator

@@ -20,7 +19,7 @@ from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
-from datachain.sql.types import JSON, Int, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO

 if TYPE_CHECKING:
@@ -126,11 +125,13 @@ class File(DataModel):
        "source": String,
        "parent": String,
        "name": String,
+       "size": Int,
        "version": String,
        "etag": String,
-       "
-       "
+       "is_latest": Boolean,
+       "last_modified": DateTime,
        "location": JSON,
+       "vtype": String,
    }

    _unique_id_keys: ClassVar[list[str]] = [
@@ -214,7 +215,7 @@ class File(DataModel):
        with self.open(mode="r") as stream:
            return stream.read()

-   def
+   def save(self, destination: str):
        """Writes it's content to destination"""
        with open(destination, mode="wb") as f:
            f.write(self.read())
@@ -232,7 +233,7 @@ class File(DataModel):
        dst_dir = os.path.dirname(dst)
        os.makedirs(dst_dir, exist_ok=True)

-       self.
+       self.save(dst)

    def _set_stream(
        self,
@@ -281,9 +282,8 @@ class File(DataModel):
    def get_path(self) -> str:
        """Returns file path."""
        path = unquote(self.get_uri())
-
-       if
-       # Drop file:// protocol
+       source = urlparse(self.source)
+       if source.scheme == "file":
            path = urlparse(path).path
            path = url2pathname(path)
        return path
@@ -298,13 +298,10 @@ class File(DataModel):
        elif placement == "etag":
            path = f"{self.etag}{self.get_file_suffix()}"
        elif placement == "fullpath":
-
-
-
-
-           path = (
-               Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
-           ).as_posix()
+           path = unquote(self.get_full_name())
+           source = urlparse(self.source)
+           if source.scheme and source.scheme != "file":
+               path = posixpath.join(source.netloc, path)
        elif placement == "checksum":
            raise NotImplementedError("Checksum placement not implemented yet")
        else:
@@ -330,7 +327,7 @@ class TextFile(File):
        with self.open() as stream:
            return stream.read()

-   def
+   def save(self, destination: str):
        """Writes it's content to destination"""
        with open(destination, mode="w") as f:
            f.write(self.read_text())
@@ -344,7 +341,7 @@ class ImageFile(File):
        fobj = super().read()
        return Image.open(BytesIO(fobj))

-   def
+   def save(self, destination: str):
        """Writes it's content to destination"""
        self.read().save(destination)

@@ -360,21 +357,25 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
        source: str,
        parent: str,
        name: str,
+       size: int,
        version: str,
        etag: str,
-
-
+       is_latest: bool,
+       last_modified: datetime,
        location: Optional[Union[dict, list[dict]]],
+       vtype: str,
    ) -> file:  # type: ignore[valid-type]
        return file(
            source=source,
            parent=parent,
            name=name,
+           size=size,
            version=version,
            etag=etag,
-
-
+           is_latest=is_latest,
+           last_modified=last_modified,
            location=location,
+           vtype=vtype,
        )

    return get_file_type
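The reworked "fullpath" placement prefixes the bucket or host only for remote sources, so exported local files no longer gain a netloc component. The core of that logic as a standalone sketch:

```py
import posixpath
from urllib.parse import unquote, urlparse

def fullpath_dest(source: str, full_name: str) -> str:
    # Remote sources get "<netloc>/<path>"; file:// sources keep the bare path.
    path = unquote(full_name)
    parsed = urlparse(source)
    if parsed.scheme and parsed.scheme != "file":
        path = posixpath.join(parsed.netloc, path)
    return path

print(fullpath_dest("s3://bucket", "dir/cat.jpg"))   # bucket/dir/cat.jpg
print(fullpath_dest("file:///tmp", "dir/cat.jpg"))   # dir/cat.jpg
```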
datachain/lib/meta_formats.py
CHANGED
@@ -11,9 +11,9 @@ from collections.abc import Iterator
 from typing import Any, Callable

 import jmespath as jsp
-from pydantic import ValidationError
+from pydantic import Field, ValidationError  # noqa: F401

-from datachain.lib.data_model import
+from datachain.lib.data_model import DataModel  # noqa: F401
 from datachain.lib.file import File


@@ -87,7 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
    except subprocess.CalledProcessError as e:
        model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
    print(f"{model_output}")
-   print("\n" +
+   print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
+   print("\n" + f"DataModel.register({model_name})" + "\n")
    print("\n" + f"spec={model_name}" + "\n")
    return model_output

@@ -147,7 +148,7 @@ def read_meta(  # noqa: C901

    def parse_data(
        file: File,
-
+       data_model=spec,
        meta_type=meta_type,
        jmespath=jmespath,
        nrows=nrows,
@@ -155,7 +156,7 @@ def read_meta(  # noqa: C901
    def validator(json_object: dict) -> spec:
        json_string = json.dumps(json_object)
        try:
-           data_instance =
+           data_instance = data_model.model_validate_json(json_string)
            yield data_instance
        except ValidationError as e:
            print(f"Validation error occurred in file {file.name}:", e)
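Validation now goes through Pydantic v2's `model_validate_json` on the chosen data model. A minimal illustration with a hypothetical model:

```py
from pydantic import BaseModel, ValidationError

class Dialog(BaseModel):
    user: str
    success: bool

try:
    obj = Dialog.model_validate_json('{"user": "alice", "success": true}')
    print(obj)  # user='alice' success=True
except ValidationError as e:
    print("Validation error:", e)
```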
datachain/query/dataset.py
CHANGED
@@ -25,6 +25,7 @@ from typing import (

 import attrs
 import sqlalchemy
+import sqlalchemy as sa
 from attrs import frozen
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
 from sqlalchemy import Column
@@ -250,7 +251,7 @@ class DatasetDiffOperation(Step):
        self,
        source_query: Select,
        target_query: Select,
-   ) ->
+   ) -> sa.Selectable:
        """
        Should return select query that calculates desired diff between dataset queries
        """
@@ -268,7 +269,7 @@ class DatasetDiffOperation(Step):

        columns = [
            c if isinstance(c, Column) else Column(c.name, c.type)
-           for c in source_query.
+           for c in source_query.selected_columns
        ]
        temp_table = self.catalog.warehouse.create_dataset_rows_table(
            temp_table_name,
@@ -292,23 +293,16 @@ class DatasetDiffOperation(Step):

@frozen
class Subtract(DatasetDiffOperation):
-
-   Calculates rows that are in a source query but are not in target query (diff)
-   This can be used to do delta updates (calculate UDF only on newly added rows)
-   Example:
-       >>> ds = DatasetQuery(name="dogs_cats")  # some older dataset with embeddings
-       >>> ds_updated = (
-           DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
-           .filter(C.size > 1000)  # we can also filter out source query
-           .subtract(ds)
-           .add_signals(calc_embeddings)  # calculae embeddings only on new rows
-           .union(ds)  # union with old dataset that's missing new rows
-           .save("dogs_cats_updated")
-       )
-   """
+   on: Sequence[str]

-   def query(self, source_query: Select, target_query: Select) ->
-
+   def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
+       sq = source_query.alias("source_query")
+       tq = target_query.alias("target_query")
+       where_clause = sa.and_(
+           getattr(sq.c, col_name).is_not_distinct_from(getattr(tq.c, col_name))
+           for col_name in self.on
+       )  # type: ignore[arg-type]
+       return sq.select().except_(sq.select().where(where_clause))


@frozen
@@ -820,8 +814,16 @@ class SQLMutate(SQLClause):
    args: tuple[ColumnElement, ...]

    def apply_sql_clause(self, query: Select) -> Select:
-
-
+       original_subquery = query.subquery()
+       # this is needed for new column to be used in clauses
+       # like ORDER BY, otherwise new column is not recognized
+       subquery = (
+           sqlalchemy.select(*original_subquery.c, *self.args)
+           .select_from(original_subquery)
+           .subquery()
+       )
+
+       return sqlalchemy.select(*subquery.c).select_from(subquery)


@frozen
@@ -1252,7 +1254,7 @@ class DatasetQuery:
    def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
        try:
            query = self.apply_steps().select()
-           selected_columns = [c.name for c in query.
+           selected_columns = [c.name for c in query.selected_columns]
            yield ResultIter(
                self.catalog.warehouse.dataset_rows_select(query, **kwargs),
                selected_columns,
@@ -1556,8 +1558,12 @@ class DatasetQuery:

    @detach
    def subtract(self, dq: "DatasetQuery") -> "Self":
+       return self._subtract(dq, on=["source", "parent", "name"])
+
+   @detach
+   def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
        query = self.clone()
-       query.steps.append(Subtract(dq, self.catalog))
+       query.steps.append(Subtract(dq, self.catalog, on=on))
        return query

    @detach
@@ -1676,7 +1682,7 @@ class DatasetQuery:
            f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
        )

-       cols = tuple(c.name for c in q.
+       cols = tuple(c.name for c in q.selected_columns)
        insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
        self.catalog.warehouse.db.execute(insert_q, **kwargs)
        self.catalog.metastore.update_dataset_status(
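The rewritten `Subtract` step computes the difference with `EXCEPT` and NULL-safe column comparisons instead of the old outer join in the warehouse. An illustrative, self-contained SQLAlchemy sketch of the same pattern on an in-memory SQLite database (not the library's internal API):

```py
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
meta = sa.MetaData()
src = sa.Table("src", meta, sa.Column("name", sa.String), sa.Column("parent", sa.String))
tgt = sa.Table("tgt", meta, sa.Column("name", sa.String), sa.Column("parent", sa.String))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(sa.insert(src), [{"name": "a", "parent": "p"}, {"name": "b", "parent": None}])
    conn.execute(sa.insert(tgt), [{"name": "a", "parent": "p"}])

sq = sa.select(src).alias("source_query")
tq = sa.select(tgt).alias("target_query")
matched = sa.and_(
    sq.c.name.is_not_distinct_from(tq.c.name),      # NULL-safe equality
    sq.c.parent.is_not_distinct_from(tq.c.parent),
)
# Source rows minus the source rows that have a match in the target.
query = sq.select().except_(sq.select().where(matched))
with engine.connect() as conn:
    print(conn.execute(query).all())  # [('b', None)]
```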
datachain/sql/sqlite/base.py
CHANGED
@@ -5,8 +5,8 @@ from datetime import MAXYEAR, MINYEAR, datetime, timezone
 from types import MappingProxyType
 from typing import Callable, Optional

+import orjson
 import sqlalchemy as sa
-import ujson
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.sql.elements import literal
@@ -149,7 +149,7 @@ def missing_vector_function(name, exc):


def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
-   return
+   return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")


def register_user_defined_sql_functions() -> None:
@@ -274,7 +274,7 @@ def compile_euclidean_distance(element, compiler, **kwargs):


def py_json_array_length(arr):
-   return len(
+   return len(orjson.loads(arr))


def compile_array_length(element, compiler, **kwargs):
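One practical difference from `ujson`: `orjson.dumps` returns `bytes`, hence the explicit `.decode("utf-8")` before handing values to SQLite. For example:

```py
import orjson

encoded = orjson.dumps(["a", "b", "c"])   # b'["a","b","c"]'
as_text = encoded.decode("utf-8")         # '["a","b","c"]' - the form stored in SQLite
assert orjson.loads(as_text) == ["a", "b", "c"]
```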
datachain/sql/sqlite/types.py
CHANGED
@@ -1,7 +1,6 @@
-import json
 import sqlite3

-import
+import orjson
 from sqlalchemy import types

 from datachain.sql.types import TypeConverter, TypeReadConverter
@@ -29,22 +28,15 @@ class Array(types.UserDefinedType):


def adapt_array(arr):
-   return
+   return orjson.dumps(arr).decode("utf-8")


def convert_array(arr):
-   return
+   return orjson.loads(arr)


def adapt_np_array(arr):
-
-       if isinstance(obj, np.ndarray):
-           return obj.tolist()
-       return obj
-
-   if np.issubdtype(arr.dtype, np.object_):
-       return json.dumps(arr.tolist(), default=_json_serialize)
-   return ujson.dumps(arr.tolist())
+   return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")


def adapt_np_generic(val):
@@ -70,5 +62,5 @@ class SQLiteTypeConverter(TypeConverter):
class SQLiteTypeReadConverter(TypeReadConverter):
    def array(self, value, item_type, dialect):
        if isinstance(value, str):
-           value =
+           value = orjson.loads(value)
        return super().array(value, item_type, dialect)
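`orjson.OPT_SERIALIZE_NUMPY` replaces the old manual ndarray-to-list conversion, handling numeric dtypes natively. A quick check of the behaviour:

```py
import numpy as np
import orjson

arr = np.array([[1, 2], [3, 4]])
encoded = orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
print(encoded)                # [[1,2],[3,4]]
print(orjson.loads(encoded))  # nested Python lists
```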
{datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.12
+Version: 0.2.14
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -35,7 +35,7 @@ Requires-Dist: sqlalchemy >=2
 Requires-Dist: multiprocess ==0.70.16
 Requires-Dist: dill ==0.3.8
 Requires-Dist: cloudpickle
-Requires-Dist:
+Requires-Dist: orjson >=3.10.5
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
@@ -45,9 +45,9 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
+Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
-Requires-Dist: types-ujson ; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
 Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
@@ -103,20 +103,18 @@ AI 🔗 DataChain
 DataChain is an open-source Python library for processing and curating unstructured
 data at scale.

-🤖 AI-Driven Data Curation: Use local ML models
+🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.

-🚀 GenAI Dataset scale: Handle
+🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.

-🐍 Python-friendly: Use strictly
+🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.


-
-downloads, and out-of-memory computing. It excels at optimizing batch operations.
-While most GenAI tools focus on online applications and realtime, DataChain is designed
-for offline data processing, data curation and ETL.
+Datachain supports parallel processing, parallel data
+downloads, and out-of-memory computing. It excels at optimizing offline batch operations.

-The typical use cases
-and validation.
+The typical use cases include Computer Vision data curation, LLM analytics,
+and validation of multimodal AI applications.


 .. code:: console
@@ -128,25 +126,25 @@ and validation.
 Quick Start
 -----------

-
-
+Data curation with a local model
+=================================

 We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
-- 50 files total in
-These dialogs involve users looking for better wireless plans
-Our goal is to identify successful dialogs.
+- 50 files total in this example.
+These dialogs involve users chatting with a bot while looking for better wireless plans.
+Our goal is to identify the successful dialogs.

-The data used in the examples is publicly available.
+The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.

-First, we'll
+First, we'll show batch inference with a simple sentiment model using the `transformers` library:

 .. code:: shell

     pip install transformers

-The code below downloads files the cloud, applies function
-
-are copied to local directory
+The code below downloads files the cloud, and applies a user-defined function
+to each one of them. All files with a positive sentiment
+detected are then copied to the local directory.

 .. code:: py

@@ -169,7 +167,7 @@ are copied to local directory `output/`.
     )

     positive_chain = chain.filter(Column("is_positive") == True)
-    positive_chain.export_files("./
+    positive_chain.export_files("./output")

     print(f"{positive_chain.count()} files were exported")

@@ -185,11 +183,11 @@ are copied to local directory `output/`.
     13


-LLM judging
-
+LLM judging chatbots
+=============================

-
-we
+LLMs can work as efficient universal classifiers. In the example below,
+we employ a free API from Mistral to judge the chatbot performance. Please get a free
 Mistral API key at https://console.mistral.ai

 .. code:: shell
@@ -197,9 +195,7 @@ Mistral API key at https://console.mistral.ai
     $ pip install mistralai
     $ export MISTRAL_API_KEY=_your_key_

-
-Note, only 4 threads were used in this example `parallel=4` due to a limitation of
-the free LLM service.
+DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.

 .. code:: py

@@ -231,7 +227,7 @@ the free LLM service.
     print(f"{successful_chain.count()} files were exported")


-With the
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:

 .. code:: shell

@@ -245,11 +241,11 @@ With the current prompt, we found 31 files considered successful dialogs:
 Serializing Python-objects
 ==========================

-LLM responses contain valuable information for analytics
-model
+LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
+model performance parameters.

-Instead of extracting this information from the Mistral data structure (class
-`ChatCompletionResponse`),
+Instead of extracting this information from the Mistral response data structure (class
+`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:


 .. code:: py

@@ -297,21 +293,23 @@ Output:
     64.0% dialogs were successful


-
+Iterating over Python data structures
 =============================================

-In the previous examples,
-(`SQLite`_ in
-These datasets
+In the previous examples, datasets were saved in the embedded database
+(`SQLite`_ in folder `.datachain` of the working directory).
+These datasets were automatically versioned, and can be accessed using
 `DataChain.from_dataset("dataset_name")`.

+Here is how to retrieve a saved dataset and iterate over the objects:
+
 .. code:: py

     chain = DataChain.from_dataset("response")

-    # Iterating one-by-one: out
+    # Iterating one-by-one: support out-of-memory workflow
     for file, response in chain.limit(5).collect("file", "response"):
-        #
+        # verify the collected Python objects
         assert isinstance(response, ChatCompletionResponse)

         status = response.choices[0].message.content[:7]
@@ -332,9 +330,8 @@ Output:
 Vectorized analytics over Python objects
 ========================================

-Some operations can
-
-Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
+Some operations can run inside the DB without deserialization.
+For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:

 .. code:: py

@@ -406,6 +403,7 @@ Community and Support
 .. github-only
 .. _Contributor Guide: CONTRIBUTING.rst
 .. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
 .. _Getting Started: https://datachain.dvc.ai/
 .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
{datachain-0.2.12.dist-info → datachain-0.2.14.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,20 +32,20 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
 datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
-datachain/data_storage/schema.py,sha256=
+datachain/data_storage/metastore.py,sha256=wVcT8MiSH_paWEXN6eZ8Z3msrHY6vWtVFTH5kwHteRE,54852
+datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=i4h8ZY15A2YNXd2PU5BZPoRaBqqs9lOdPtBjC0BZy3s,24935
+datachain/data_storage/warehouse.py,sha256=fQO6UZc2MFgFPRnpCQW7c1GCl3FJBYE4dtA_ZXWuA8M,32627
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dc.py,sha256=I3BLJJK17kB8velBSCTjtoR8CcPZOHPgFTibS9OclmY,54155
+datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=WRjUzaBKo0IJFHhKz7dxzAKXjR4OvuzsLjkdjyewL6Q,7001
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
@@ -58,15 +58,15 @@ datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=
+datachain/lib/convert/flatten.py,sha256=vrj2Kg-I1YAq2OGAFIwFUqtIesGpweve3c1ipeFOvDQ,1615
 datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
 datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
-datachain/lib/convert/values_to_tuples.py,sha256=
+datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=VhsbHTOps-E4_trLzkJWGQV3zblN6LdlyHED9-3H5Vo,61388
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -88,13 +88,13 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=
-datachain/sql/sqlite/types.py,sha256=
+datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,12067
+datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
+datachain-0.2.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.14.dist-info/METADATA,sha256=UiBiVmF8nF2aIimMNPn3XB14OhIbRj0w4w5q72qTaRM,14577
+datachain-0.2.14.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+datachain-0.2.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|