datachain 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.

```diff
@@ -421,10 +421,6 @@ class AbstractMetastore(ABC, Serializable):
     ) -> None:
         """Set the status of the given job and dataset."""
 
-    @abstractmethod
-    def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-        """Returns the possibly stale jobs."""
-
 
 class AbstractDBMetastore(AbstractMetastore):
     """
```
```diff
@@ -19,8 +19,12 @@ from datachain.sql.types import Int, SQLType, UInt64
 if TYPE_CHECKING:
     from sqlalchemy import Engine
     from sqlalchemy.engine.interfaces import Dialect
-    from sqlalchemy.sql.base import Executable, ReadOnlyColumnCollection
-    from sqlalchemy.sql.elements import KeyedColumnElement
+    from sqlalchemy.sql.base import (
+        ColumnCollection,
+        Executable,
+        ReadOnlyColumnCollection,
+    )
+    from sqlalchemy.sql.elements import ColumnElement
 
 
 def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -43,7 +47,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
 
 
 def convert_rows_custom_column_types(
-    columns: "ReadOnlyColumnCollection[str, KeyedColumnElement[Any]]",
+    columns: "ColumnCollection[str, ColumnElement[Any]]",
     rows: Iterator[tuple[Any, ...]],
     dialect: "Dialect",
 ):
```
```diff
@@ -42,6 +42,7 @@ if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine
 
 
@@ -496,9 +497,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)
 
-    def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-        raise NotImplementedError("get_possibly_stale_jobs not implemented for SQLite")
-
 
 class SQLiteWarehouse(AbstractWarehouse):
     """
@@ -594,7 +592,7 @@ class SQLiteWarehouse(AbstractWarehouse):
     ):
         rows = self.db.execute(select_query, **kwargs)
         yield from convert_rows_custom_column_types(
-            select_query.columns, rows, sqlite_dialect
+            select_query.selected_columns, rows, sqlite_dialect
         )
 
     def get_dataset_sources(
```
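The recurring `.columns` → `.selected_columns` rename in this release tracks SQLAlchemy's API: on a `Select`, the `.c`/`.columns` accessor was deprecated in 1.4 and removed in 2.0 in favor of `.selected_columns`, which also avoids implicitly treating the query as a subquery. A minimal standalone sketch of the difference (illustrative table, not datachain code):

```python
import sqlalchemy as sa

t = sa.table("nodes", sa.column("id"), sa.column("name"))
query = sa.select(t.c.id, t.c.name)

# Pre-1.4 style (removed in SQLAlchemy 2.0): query.columns / query.c
# 1.4+ style, as used throughout this release:
names = [c.name for c in query.selected_columns]
print(names)  # ['id', 'name']
```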
```diff
@@ -708,3 +706,23 @@ class SQLiteWarehouse(AbstractWarehouse):
         client_config=None,
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
+
+    def create_pre_udf_table(self, query: "Select") -> "Table":
+        """
+        Create a temporary table from a query for use in a UDF.
+        """
+        columns = [
+            sqlalchemy.Column(c.name, c.type)
+            for c in query.selected_columns
+            if c.name != "sys__id"
+        ]
+        table = self.create_udf_table(columns)
+
+        select_q = query.with_only_columns(
+            *[c for c in query.selected_columns if c.name != "sys__id"]
+        )
+        self.db.execute(
+            table.insert().from_select(list(select_q.selected_columns), select_q)
+        )
+
+        return table
```
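The new SQLite `create_pre_udf_table` materializes an arbitrary `SELECT` into a scratch table: it copies the query's columns minus `sys__id` (so the new table can assign its own primary keys), creates a UDF table with those columns, and bulk-loads it with `INSERT ... FROM SELECT`. A standalone sketch of the same pattern in plain SQLAlchemy against in-memory SQLite (table and column names here are illustrative, not from the source):

```python
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
meta = sa.MetaData()
src = sa.Table(
    "src",
    meta,
    sa.Column("sys__id", sa.Integer, primary_key=True),
    sa.Column("name", sa.Text),
)
meta.create_all(engine)

query = sa.select(src)

# Copy the query's columns, dropping sys__id so the scratch table
# can generate its own ids.
cols = [
    sa.Column(c.name, c.type)
    for c in query.selected_columns
    if c.name != "sys__id"
]
scratch = sa.Table(
    "udf_scratch",
    meta,
    sa.Column("sys__id", sa.Integer, primary_key=True),
    *cols,
)
scratch.create(engine)

select_q = query.with_only_columns(
    *[c for c in query.selected_columns if c.name != "sys__id"]
)
with engine.begin() as conn:
    conn.execute(sa.insert(src), [{"name": "a"}, {"name": "b"}])
    # INSERT INTO udf_scratch (name) SELECT name FROM src
    conn.execute(
        scratch.insert().from_select(list(select_q.selected_columns), select_q)
    )
    print(conn.execute(sa.select(scratch)).all())  # [(1, 'a'), (2, 'b')]
```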
```diff
@@ -2,6 +2,8 @@ import glob
 import json
 import logging
 import posixpath
+import random
+import string
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -24,6 +26,7 @@ from datachain.utils import sql_escape_like
 if TYPE_CHECKING:
     from sqlalchemy.sql._typing import _ColumnsClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine
 
     from datachain.data_storage import AbstractIDGenerator, schema
@@ -252,6 +255,12 @@ class AbstractWarehouse(ABC, Serializable):
         prefix = self.DATASET_SOURCE_TABLE_PREFIX
         return f"{prefix}{dataset_name}_{version}"
 
+    def temp_table_name(self) -> str:
+        return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
+
+    def udf_table_name(self) -> str:
+        return self.UDF_TABLE_NAME_PREFIX + _random_string(6)
+
     #
     # Datasets
     #
```
```diff
@@ -494,7 +503,7 @@ class AbstractWarehouse(ABC, Serializable):
         This gets nodes based on the provided query, and should be used sparingly,
         as it will be slow on any OLAP database systems.
         """
-        columns = [c.name for c in query.columns]
+        columns = [c.name for c in query.selected_columns]
         for row in self.db.execute(query):
             d = dict(zip(columns, row))
             yield Node(**d)
@@ -869,8 +878,8 @@ class AbstractWarehouse(ABC, Serializable):
 
     def create_udf_table(
         self,
-        name: str,
         columns: Sequence["sa.Column"] = (),
+        name: Optional[str] = None,
     ) -> "sa.Table":
         """
         Create a temporary table for storing custom signals generated by a UDF.
@@ -878,7 +887,7 @@ class AbstractWarehouse(ABC, Serializable):
         and UDFs are run in other processes when run in parallel.
         """
         tbl = sa.Table(
-            name,
+            name or self.udf_table_name(),
             sa.MetaData(),
             sa.Column("sys__id", Int, primary_key=True),
             *columns,
@@ -886,6 +895,12 @@ class AbstractWarehouse(ABC, Serializable):
         self.db.create_table(tbl, if_not_exists=True)
         return tbl
 
+    @abstractmethod
+    def create_pre_udf_table(self, query: "Select") -> "Table":
+        """
+        Create a temporary table from a query for use in a UDF.
+        """
+
     def is_temp_table_name(self, name: str) -> bool:
         """Returns if the given table name refers to a temporary
         or no longer needed table."""
```
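`create_udf_table` flips its parameters: `columns` now comes first and `name` is optional, falling back to a generated `udf_<suffix>` name from the new `udf_table_name` helper. A self-contained sketch of the new calling convention (using plain `sqlalchemy.Integer` in place of datachain's `Int` type, and a free function in place of the method):

```python
import random
import string
from collections.abc import Sequence
from typing import Optional

import sqlalchemy as sa

UDF_TABLE_NAME_PREFIX = "udf_"  # assumption: mirrors the class constant

def _random_string(length: int) -> str:
    return "".join(
        random.choice(string.ascii_letters + string.digits)  # noqa: S311
        for _ in range(length)
    )

def create_udf_table(
    columns: Sequence[sa.Column] = (),
    name: Optional[str] = None,
) -> sa.Table:
    # name is optional now; a "udf_XXXXXX" name is generated on demand
    return sa.Table(
        name or UDF_TABLE_NAME_PREFIX + _random_string(6),
        sa.MetaData(),
        sa.Column("sys__id", sa.Integer, primary_key=True),
        *columns,
    )

tbl = create_udf_table([sa.Column("score", sa.Float)])
print(tbl.name)  # e.g. 'udf_k3X9aZ'
tbl2 = create_udf_table([sa.Column("score", sa.Float)], name="udf_custom")
print(tbl2.name)  # 'udf_custom'
```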
```diff
@@ -912,29 +927,6 @@ class AbstractWarehouse(ABC, Serializable):
         for name in names:
             self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
 
-    def subtract_query(
-        self,
-        source_query: sa.sql.selectable.Select,
-        target_query: sa.sql.selectable.Select,
-    ) -> sa.sql.selectable.Select:
-        sq = source_query.alias("source_query")
-        tq = target_query.alias("target_query")
-
-        source_target_join = sa.join(
-            sq,
-            tq,
-            (sq.c.source == tq.c.source)
-            & (sq.c.parent == tq.c.parent)
-            & (sq.c.name == tq.c.name),
-            isouter=True,
-        )
-
-        return (
-            select(*sq.c)
-            .select_from(source_target_join)
-            .where((tq.c.name == None) | (tq.c.name == ""))  # noqa: E711
-        )
-
     def changed_query(
         self,
         source_query: sa.sql.selectable.Select,
@@ -960,3 +952,10 @@ class AbstractWarehouse(ABC, Serializable):
                 & (tq.c.is_latest == true())
             )
         )
+
+
+def _random_string(length: int) -> str:
+    return "".join(
+        random.choice(string.ascii_letters + string.digits)  # noqa: S311
+        for i in range(length)
+    )
```
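One note on the design (my arithmetic, not a claim from the source): the suffix only needs to be distinct, not unpredictable, which is why plain `random` with a `# noqa: S311` suppression of bandit's pseudo-random warning is acceptable here.

```python
# 62 characters over 6 positions gives ~5.7e10 distinct suffixes,
# so collisions between short-lived temp/UDF tables are unlikely.
print(62 ** 6)  # 56800235584
```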
datachain/lib/arrow.py CHANGED
```diff
@@ -10,13 +10,17 @@ from datachain.lib.file import File, IndexedFile
 from datachain.lib.udf import Generator
 
 if TYPE_CHECKING:
+    from pydantic import BaseModel
+
     from datachain.lib.dc import DataChain
 
 
 class ArrowGenerator(Generator):
     def __init__(
         self,
-        schema: Optional["pa.Schema"] = None,
+        input_schema: Optional["pa.Schema"] = None,
+        output_schema: Optional[type["BaseModel"]] = None,
+        source: bool = True,
         nrows: Optional[int] = None,
         **kwargs,
     ):
@@ -25,24 +29,36 @@ class ArrowGenerator(Generator):
 
         Parameters:
 
-        schema : Optional pyarrow schema for validation.
+        input_schema : Optional pyarrow schema for validation.
+        output_schema : Optional pydantic model for validation.
+        source : Whether to include info about the source file.
         nrows : Optional row limit.
         kwargs: Parameters to pass to pyarrow.dataset.dataset.
         """
         super().__init__()
-        self.schema = schema
+        self.input_schema = input_schema
+        self.output_schema = output_schema
+        self.source = source
         self.nrows = nrows
         self.kwargs = kwargs
 
     def process(self, file: File):
         path = file.get_path()
-        ds = dataset(path, filesystem=file.get_fs(), schema=self.schema, **self.kwargs)
+        ds = dataset(
+            path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+        )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches():
+            for record_batch in ds.to_batches(use_threads=False):
                 for record in record_batch.to_pylist():
-                    source = IndexedFile(file=file, index=index)
-                    yield [source, *record.values()]
+                    vals = list(record.values())
+                    if self.output_schema:
+                        fields = self.output_schema.model_fields
+                        vals = [self.output_schema(**dict(zip(fields, vals)))]
+                    if self.source:
+                        yield [IndexedFile(file=file, index=index), *vals]
+                    else:
+                        yield vals
                     index += 1
                     if self.nrows and index >= self.nrows:
                         return
```
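Two behavioral changes ride along with the `schema` → `input_schema` rename: batches are now read with `use_threads=False`, presumably so rows arrive in file order and `index` stays aligned with row position, and the yield path can pack each row into a pydantic `output_schema` model and drop the `IndexedFile` wrapper when `source=False`. A distilled sketch of just the new yield logic, using hypothetical records in place of a real pyarrow dataset:

```python
from pydantic import BaseModel

class Row(BaseModel):
    name: str
    age: int

records = [{"name": "alice", "age": 30}, {"name": "bob", "age": 41}]
output_schema = Row
source = False

for index, record in enumerate(records):
    vals = list(record.values())
    if output_schema:
        # zip row values against the model's declared fields, in order
        fields = output_schema.model_fields
        vals = [output_schema(**dict(zip(fields, vals)))]
    row = [("file-info-placeholder", index), *vals] if source else vals
    print(row)
# [Row(name='alice', age=30)]
# [Row(name='bob', age=41)]
```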
@@ -76,7 +92,10 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
76
92
  if not column:
77
93
  column = f"c{default_column}"
78
94
  default_column += 1
79
- output[column] = _arrow_type_mapper(field.type) # type: ignore[assignment]
95
+ dtype = _arrow_type_mapper(field.type) # type: ignore[assignment]
96
+ if field.nullable:
97
+ dtype = Optional[dtype] # type: ignore[assignment]
98
+ output[column] = dtype
80
99
 
81
100
  return output
82
101
 
```diff
@@ -41,17 +41,22 @@ def flatten_list(obj_list):
     )
 
 
+def _flatten_list_field(value: list):
+    assert isinstance(value, list)
+    if value and ModelStore.is_pydantic(type(value[0])):
+        return [val.model_dump() for val in value]
+    if value and isinstance(value[0], list):
+        return [_flatten_list_field(v) for v in value]
+    return value
+
+
 def _flatten_fields_values(fields, obj: BaseModel):
     for name, f_info in fields.items():
         anno = f_info.annotation
         # Optimization: Access attributes directly to skip the model_dump() call.
         value = getattr(obj, name)
-
         if isinstance(value, list):
-            yield [
-                val.model_dump() if ModelStore.is_pydantic(type(val)) else val
-                for val in value
-            ]
+            yield _flatten_list_field(value)
         elif isinstance(value, dict):
             yield {
                 key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
```
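The extracted `_flatten_list_field` adds one capability the old inline comprehension lacked: it recurses into lists of lists, so nested collections of pydantic models are dumped at every level. A standalone illustration, with a local `is_pydantic` standing in for `ModelStore.is_pydantic`:

```python
from pydantic import BaseModel

class Point(BaseModel):
    x: int
    y: int

def is_pydantic(t) -> bool:  # stand-in for ModelStore.is_pydantic
    return isinstance(t, type) and issubclass(t, BaseModel)

def _flatten_list_field(value: list):
    assert isinstance(value, list)
    if value and is_pydantic(type(value[0])):
        return [val.model_dump() for val in value]
    if value and isinstance(value[0], list):
        return [_flatten_list_field(v) for v in value]
    return value

nested = [[Point(x=1, y=2)], [Point(x=3, y=4)]]
print(_flatten_list_field(nested))
# [[{'x': 1, 'y': 2}], [{'x': 3, 'y': 4}]]
```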
```diff
@@ -82,7 +82,7 @@ def python_to_sql(typ):  # noqa: PLR0911
 def _is_json_inside_union(orig, args) -> bool:
     if orig == Union and len(args) >= 2:
         # List in JSON: Union[dict, list[dict]]
-        args_no_nones = [arg for arg in args if arg != type(None)]
+        args_no_nones = [arg for arg in args if arg != type(None)]  # noqa: E721
         if len(args_no_nones) == 2:
             args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
             if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
```
```diff
@@ -71,7 +71,10 @@ def values_to_tuples(  # noqa: C901, PLR0912
                 f"signal '{k}' has unsupported type '{typ.__name__}'."
                 f" Please use DataModel types: {DataTypeNames}",
             )
-        types_map[k] = typ
+        if typ is list:
+            types_map[k] = list[type(v[0][0])]  # type: ignore[misc]
+        else:
+            types_map[k] = typ
 
     if length < 0:
         length = len_
```
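For signals typed as a bare `list`, `values_to_tuples` now samples the first element of the first value to produce a parametrized annotation such as `list[str]`. Note that this, like the change itself, assumes the first list is non-empty. A toy reconstruction of the inference:

```python
values = {"tags": [["a", "b"], ["c"]]}  # hypothetical signal values

types_map = {}
for k, v in values.items():
    typ = type(v[0])
    if typ is list:
        # sample the element type from the first element of the first row
        types_map[k] = list[type(v[0][0])]
    else:
        types_map[k] = typ

print(types_map)  # {'tags': list[str]}
```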
```diff
@@ -47,7 +47,12 @@ def is_chain_type(t: type) -> bool:
     if any(t is ft or t is get_args(ft)[0] for ft in get_args(StandardType)):
         return True
 
-    if get_origin(t) is list and len(get_args(t)) == 1:
+    orig = get_origin(t)
+    args = get_args(t)
+    if orig is list and len(args) == 1:
         return is_chain_type(get_args(t)[0])
 
+    if orig is Union and len(args) == 2 and (type(None) in args):
+        return is_chain_type(args[0])
+
     return False
```
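`is_chain_type` now accepts `Optional[T]`: since `Optional[T]` is `Union[T, None]` with exactly two args, the new branch recurses into the non-`None` member. A trimmed-down sketch with a stand-in for the `StandardType` check:

```python
from typing import Optional, Union, get_args, get_origin

def is_chain_type(t: type) -> bool:
    if t in (int, str, float, bool):  # stand-in for the StandardType check
        return True

    orig = get_origin(t)
    args = get_args(t)
    if orig is list and len(args) == 1:
        return is_chain_type(args[0])

    # Optional[T] == Union[T, None]; recurse into T
    if orig is Union and len(args) == 2 and (type(None) in args):
        return is_chain_type(args[0])

    return False

print(is_chain_type(Optional[str]))        # True
print(is_chain_type(list[Optional[int]]))  # True
print(is_chain_type(dict))                 # False
```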