datachain 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -236,36 +236,36 @@ class DatasetRowsFetcher(NodesThreadPool):
         import lz4.frame
         import pandas as pd
 
-        metastore = self.metastore.clone()  # metastore is not thread safe
-        warehouse = self.warehouse.clone()  # warehouse is not thread safe
-        dataset = metastore.get_dataset(self.dataset_name)
-
-        urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
+        # metastore and warehouse are not thread safe
+        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
+            dataset = metastore.get_dataset(self.dataset_name)
 
-                r.raise_for_status()
+            urls = list(urls)
+            while urls:
+                for url in urls:
+                    if self.should_check_for_status():
+                        self.check_for_status()
 
-                df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
+                    r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+                    if r.status_code == 404:
+                        time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                        # moving to the next url
+                        continue
 
-                self.fix_columns(df)
+                    r.raise_for_status()
 
-                # id will be autogenerated in DB
-                df = df.drop("sys__id", axis=1)
+                    df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
 
-                inserted = warehouse.insert_dataset_rows(
-                    df, dataset, self.dataset_version
-                )
-                self.increase_counter(inserted)  # type: ignore [arg-type]
-                urls.remove(url)
+                    self.fix_columns(df)
+
+                    # id will be autogenerated in DB
+                    df = df.drop("sys__id", axis=1)
+
+                    inserted = warehouse.insert_dataset_rows(
+                        df, dataset, self.dataset_version
+                    )
+                    self.increase_counter(inserted)  # type: ignore [arg-type]
+                    urls.remove(url)
 
 
 @dataclass
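Both clones now act as context managers, so each worker thread can scope its private metastore and warehouse connections instead of leaking them. A minimal sketch of the same pattern outside the fetcher, assuming a default local catalog and a hypothetical dataset name:

.. code:: python

    from datachain.catalog import get_catalog

    catalog = get_catalog()

    # Per-thread clones; __exit__ releases the clone's resources without
    # touching the shared parent connections.
    with catalog.metastore.clone() as metastore, catalog.warehouse.clone() as warehouse:
        dataset = metastore.get_dataset("my_dataset")  # hypothetical dataset name
        ...  # read or insert rows through `warehouse` here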
@@ -720,7 +720,6 @@ class Catalog:
             client.uri, posixpath.join(prefix, "")
         )
         source_metastore = self.metastore.clone(client.uri)
-        source_warehouse = self.warehouse.clone()
 
         columns = [
             Column("vtype", String),
@@ -1835,25 +1834,29 @@ class Catalog:
         if signed_urls:
             shuffle(signed_urls)
 
-        rows_fetcher = DatasetRowsFetcher(
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            remote_config,
-            dataset.name,
-            version,
-            schema,
-        )
-        try:
-            rows_fetcher.run(
-                batched(
-                    signed_urls,
-                    math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                ),
-                dataset_save_progress_bar,
+        with (
+            self.metastore.clone() as metastore,
+            self.warehouse.clone() as warehouse,
+        ):
+            rows_fetcher = DatasetRowsFetcher(
+                metastore,
+                warehouse,
+                remote_config,
+                dataset.name,
+                version,
+                schema,
             )
-        except:
-            self.remove_dataset(dataset.name, version)
-            raise
+            try:
+                rows_fetcher.run(
+                    batched(
+                        signed_urls,
+                        math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
+                    ),
+                    dataset_save_progress_bar,
+                )
+            except:
+                self.remove_dataset(dataset.name, version)
+                raise
 
         dataset = self.metastore.update_dataset_status(
             dataset,
datachain/data_storage/db_engine.py CHANGED
@@ -4,7 +4,6 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
 
 import sqlalchemy as sa
-from attrs import frozen
 from sqlalchemy.sql import FROM_LINTING
 from sqlalchemy.sql.roles import DDLRole
 
@@ -23,13 +22,18 @@ logger = logging.getLogger("datachain")
 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
 
 
-@frozen
 class DatabaseEngine(ABC, Serializable):
     dialect: ClassVar["Dialect"]
 
     engine: "Engine"
     metadata: "MetaData"
 
+    def __enter__(self) -> "DatabaseEngine":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
     @abstractmethod
     def clone(self) -> "DatabaseEngine":
         """Clones DatabaseEngine implementation."""
datachain/data_storage/id_generator.py CHANGED
@@ -33,6 +33,16 @@ class AbstractIDGenerator(ABC, Serializable):
     def cleanup_for_tests(self):
         """Cleanup for tests."""
 
+    def close(self) -> None:
+        """Closes any active database connections."""
+
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some ID Generator implementations may handle this
+        differently.
+        """
+        self.close()
+
     @abstractmethod
     def init_id(self, uri: str) -> None:
         """Initializes the ID generator for the given URI with zero last_id."""
@@ -83,6 +93,10 @@ class AbstractDBIDGenerator(AbstractIDGenerator):
     def clone(self) -> "AbstractDBIDGenerator":
         """Clones AbstractIDGenerator implementation."""
 
+    def close(self) -> None:
+        """Closes any active database connections."""
+        self.db.close()
+
     @property
     def db(self) -> "DatabaseEngine":
         return self._db
datachain/data_storage/metastore.py CHANGED
@@ -78,6 +78,13 @@ class AbstractMetastore(ABC, Serializable):
         self.uri = uri
         self.partial_id: Optional[int] = partial_id
 
+    def __enter__(self) -> "AbstractMetastore":
+        """Returns self upon entering context manager."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Default behavior is to do nothing, as connections may be shared."""
+
     @abstractmethod
     def clone(
         self,
@@ -97,6 +104,12 @@ class AbstractMetastore(ABC, Serializable):
     def close(self) -> None:
         """Closes any active database or HTTP connections."""
 
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Metastore implementations may handle this
+        differently."""
+        self.close()
+
     def cleanup_tables(self, temp_table_names: list[str]) -> None:
         """Cleanup temp tables."""
 
datachain/data_storage/sqlite.py CHANGED
@@ -15,7 +15,6 @@ from typing import (
 )
 
 import sqlalchemy
-from attrs import frozen
 from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
@@ -40,6 +39,7 @@ from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
+    from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
     from sqlalchemy.sql.selectable import Select
@@ -52,6 +52,8 @@ RETRY_START_SEC = 0.01
 RETRY_MAX_TIMES = 10
 RETRY_FACTOR = 2
 
+DETECT_TYPES = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
+
 Column = Union[str, "ColumnClause[Any]", "TextClause"]
 
 datachain.sql.sqlite.setup()
@@ -80,26 +82,41 @@ def retry_sqlite_locks(func):
     return wrapper
 
 
-@frozen
 class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
     db: sqlite3.Connection
     db_file: Optional[str]
+    is_closed: bool
+
+    def __init__(
+        self,
+        engine: "Engine",
+        metadata: "MetaData",
+        db: sqlite3.Connection,
+        db_file: Optional[str] = None,
+    ):
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
 
     @classmethod
     def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
-        detect_types = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
+        return cls(*cls._connect(db_file=db_file))
 
+    @staticmethod
+    def _connect(db_file: Optional[str] = None):
         try:
             if db_file == ":memory:":
                 # Enable multithreaded usage of the same in-memory db
                 db = sqlite3.connect(
-                    "file::memory:?cache=shared", uri=True, detect_types=detect_types
+                    "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
                 )
             else:
                 db = sqlite3.connect(
-                    db_file or DataChainDir.find().db, detect_types=detect_types
+                    db_file or DataChainDir.find().db, detect_types=DETECT_TYPES
                 )
             create_user_defined_sql_functions(db)
             engine = sqlalchemy.create_engine(
@@ -118,7 +135,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
             load_usearch_extension(db)
 
-            return cls(engine, MetaData(), db, db_file)
+            return engine, MetaData(), db, db_file
         except RuntimeError:
             raise DataChainError("Can't connect to SQLite DB") from None
 
@@ -138,6 +155,16 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             {},
         )
 
+    def _reconnect(self) -> None:
+        if not self.is_closed:
+            raise RuntimeError("Cannot reconnect on still-open DB!")
+        engine, metadata, db, db_file = self._connect(db_file=self.db_file)
+        self.engine = engine
+        self.metadata = metadata
+        self.db = db
+        self.db_file = db_file
+        self.is_closed = False
+
     @retry_sqlite_locks
     def execute(
         self,
@@ -145,6 +172,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         cursor: Optional[sqlite3.Cursor] = None,
         conn=None,
     ) -> sqlite3.Cursor:
+        if self.is_closed:
+            # Reconnect in case of being closed previously.
+            self._reconnect()
         if cursor is not None:
            result = cursor.execute(*self.compile_to_args(query))
         elif conn is not None:
@@ -179,6 +209,7 @@
 
     def close(self) -> None:
         self.db.close()
+        self.is_closed = True
 
     @contextmanager
     def transaction(self):
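Closing a `SQLiteDatabaseEngine` now only marks it closed; the next `execute()` call reconnects through `_reconnect()`. A rough sketch of that lifecycle, assuming an in-memory database and an illustrative one-row query:

.. code:: python

    import sqlalchemy as sa

    from datachain.data_storage.sqlite import SQLiteDatabaseEngine

    db = SQLiteDatabaseEngine.from_db_file(":memory:")

    db.close()          # closes the sqlite3 connection and sets is_closed = True
    assert db.is_closed

    # execute() sees is_closed and calls _reconnect() before running the query.
    rows = list(db.execute(sa.select(sa.literal(1))))
    assert not db.is_closed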
@@ -359,6 +390,10 @@ class SQLiteMetastore(AbstractDBMetastore):
 
         self._init_tables()
 
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(
         self,
         uri: StorageURI = StorageURI(""),
@@ -521,6 +556,10 @@ class SQLiteWarehouse(AbstractWarehouse):
 
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close connection upon exit from context manager."""
+        self.close()
+
     def clone(self, use_new_connection: bool = False) -> "SQLiteWarehouse":
         return SQLiteWarehouse(self.id_generator.clone(), db=self.db.clone())
 
datachain/data_storage/warehouse.py CHANGED
@@ -70,6 +70,13 @@ class AbstractWarehouse(ABC, Serializable):
     def __init__(self, id_generator: "AbstractIDGenerator"):
         self.id_generator = id_generator
 
+    def __enter__(self) -> "AbstractWarehouse":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        # Default behavior is to do nothing, as connections may be shared.
+        pass
+
     def cleanup_for_tests(self):
         """Cleanup for tests."""
 
@@ -158,6 +165,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Closes any active database connections."""
         self.db.close()
 
+    def close_on_exit(self) -> None:
+        """Closes any active database or HTTP connections, called on Session exit or
+        for test cleanup only, as some Warehouse implementations may handle this
+        differently."""
+        self.close()
+
     #
     # Query Tables
     #
datachain/lib/arrow.py CHANGED
@@ -1,5 +1,6 @@
 import re
 from collections.abc import Sequence
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Optional
 
 import pyarrow as pa
@@ -43,13 +44,17 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs
 
     def process(self, file: File):
-        path = file.get_path()
-        ds = dataset(
-            path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
-        )
+        if self.nrows:
+            path = _nrows_file(file, self.nrows)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        else:
+            path = file.get_path()
+            ds = dataset(
+                path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+            )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches(use_threads=False):
+            for record_batch in ds.to_batches():
                 for record in record_batch.to_pylist():
                     vals = list(record.values())
                     if self.output_schema:
@@ -60,8 +65,6 @@ class ArrowGenerator(Generator):
                     else:
                         yield vals
                     index += 1
-                    if self.nrows and index >= self.nrows:
-                        return
                 pbar.update(len(record_batch))
 
 
@@ -125,3 +128,15 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if isinstance(col_type, pa.lib.DictionaryType):
         return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")
+
+
+def _nrows_file(file: File, nrows: int) -> str:
+    tf = NamedTemporaryFile(delete=False)
+    with file.open(mode="r") as reader:
+        with open(tf.name, "a") as writer:
+            for row, line in enumerate(reader):
+                if row >= nrows:
+                    break
+                writer.write(line)
+                writer.write("\n")
+    return tf.name
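`_nrows_file` materializes only the first `nrows` lines of a line-delimited file into a temporary file, which is what `ArrowGenerator` now hands to `pyarrow.dataset` when a row limit is set. Similar logic for a plain local path, as a hedged standalone sketch (the helper name is illustrative):

.. code:: python

    from tempfile import NamedTemporaryFile


    def head_to_tempfile(path: str, nrows: int) -> str:
        """Copy the first `nrows` lines of a local text file into a temp file."""
        tf = NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
        with open(path) as reader, tf as writer:
            for row, line in enumerate(reader):
                if row >= nrows:
                    break
                writer.write(line)
        return tf.name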
datachain/lib/dc.py CHANGED
@@ -829,8 +829,19 @@ class DataChain(DatasetQuery):
         )
         ```
         """
-        chain = super().mutate(**kwargs)
-        chain.signals_schema = self.signals_schema.mutate(kwargs)
+        mutated = {}
+        schema = self.signals_schema
+        for name, value in kwargs.items():
+            if isinstance(value, Column):
+                # renaming existing column
+                for signal in schema.db_signals(name=value.name, as_columns=True):
+                    mutated[signal.name.replace(value.name, name, 1)] = signal
+            else:
+                # adding new signal
+                mutated[name] = value
+
+        chain = super().mutate(**mutated)
+        chain.signals_schema = schema.mutate(kwargs)
         return chain
 
     @property
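`mutate()` now treats a bare `Column` argument as a rename of an existing signal, carrying all of its flattened DB columns over to the new name; anything else is still added as a new signal. A hedged usage sketch (signal names and values are illustrative):

.. code:: python

    from datachain.lib.dc import DataChain
    from datachain.query.schema import Column

    chain = DataChain.from_values(size=[1, 2, 3])

    renamed = chain.mutate(length=Column("size"))          # rename an existing signal
    enriched = renamed.mutate(twice=Column("length") * 2)  # add a new derived signal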
@@ -1099,7 +1110,7 @@ class DataChain(DatasetQuery):
             )
         else:
             signals = self.signals_schema.resolve(*on).db_signals()
-        return super()._subtract(other, signals)
+        return super()._subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
     def from_values(
@@ -1261,8 +1272,21 @@ class DataChain(DatasetQuery):
         dc = dc.parse_tabular(format="json")
         ```
         """
+        from pyarrow.dataset import CsvFileFormat, JsonFileFormat
+
         from datachain.lib.arrow import ArrowGenerator, infer_schema, schema_to_output
 
+        if nrows:
+            format = kwargs.get("format")
+            if format not in ["csv", "json"] and not isinstance(
+                format, (CsvFileFormat, JsonFileFormat)
+            ):
+                raise DatasetPrepareError(
+                    self.name,
+                    "error in `parse_tabular` - "
+                    "`nrows` only supported for csv and json formats.",
+                )
+
         schema = None
         col_names = output if isinstance(output, Sequence) else None
         if col_names or not output:
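With this guard, `nrows` is only honoured for formats that can be truncated line by line (CSV and JSON, given either as strings or as pyarrow format objects); other formats raise `DatasetPrepareError`. A hedged sketch of both outcomes (paths are illustrative):

.. code:: python

    from datachain.lib.dc import DataChain

    # OK: JSON is line-oriented, so only the first 100 rows are parsed.
    preview = DataChain.from_storage("data/records.jsonl").parse_tabular(
        format="json", nrows=100
    )

    # Raises DatasetPrepareError: nrows is only supported for csv and json.
    DataChain.from_storage("data/table.parquet").parse_tabular(
        format="parquet", nrows=100
    )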
@@ -1360,6 +1384,8 @@ class DataChain(DatasetQuery):
             else:
                 msg = f"error parsing csv - incompatible output type {type(output)}"
                 raise DatasetPrepareError(chain.name, msg)
+        elif nrows:
+            nrows += 1
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
@@ -1382,7 +1408,6 @@
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
-        nrows=None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1395,7 +1420,6 @@
             object_name : Created object column name.
             model_name : Generated model name.
             source : Whether to include info about the source file.
-            nrows : Optional row limit.
 
         Example:
             Reading a single file:
@@ -1414,7 +1438,6 @@
             object_name=object_name,
             model_name=model_name,
             source=source,
-            nrows=None,
             format="parquet",
             partitioning=partitioning,
         )
datachain/lib/file.py CHANGED
@@ -317,9 +317,9 @@ class TextFile(File):
     """`DataModel` for reading text files."""
 
     @contextmanager
-    def open(self):
-        """Open the file and return a file object in text mode."""
-        with super().open(mode="r") as stream:
+    def open(self, mode: Literal["rb", "r"] = "r"):
+        """Open the file and return a file object (default to text mode)."""
+        with super().open(mode=mode) as stream:
             yield stream
 
     def read_text(self):
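`TextFile.open()` keeps text mode as its default but can now return the raw byte stream as well. A hedged sketch of a mapper-style function using both modes (the function itself is illustrative):

.. code:: python

    from datachain.lib.file import TextFile


    def char_vs_byte_count(file: TextFile) -> tuple[int, int]:
        """Compare character count (text mode) with byte count (binary mode)."""
        with file.open() as f:            # mode="r" by default
            n_chars = len(f.read())
        with file.open(mode="rb") as f:   # new: explicit binary mode
            n_bytes = len(f.read())
        return n_chars, n_bytes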
datachain/lib/signal_schema.py CHANGED
@@ -25,7 +25,7 @@ from datachain.lib.data_model import DataModel, DataType
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
-from datachain.query.schema import DEFAULT_DELIMITER
+from datachain.query.schema import DEFAULT_DELIMITER, Column
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -222,13 +222,30 @@ class SignalSchema:
                 res.append(obj)
         return res
 
-    def db_signals(self) -> list[str]:
-        return [
+    def db_signals(
+        self, name: Optional[str] = None, as_columns=False
+    ) -> Union[list[str], list[Column]]:
+        """
+        Returns DB columns as strings or Column objects with proper types
+        Optionally, it can filter results by specific object, returning only his signals
+        """
+        signals = [
             DEFAULT_DELIMITER.join(path)
-            for path, _, has_subtree, _ in self.get_flat_tree()
+            if not as_columns
+            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            for path, _type, has_subtree, _ in self.get_flat_tree()
             if not has_subtree
         ]
 
+        if name:
+            signals = [
+                s
+                for s in signals
+                if str(s) == name or str(s).startswith(f"{name}{DEFAULT_DELIMITER}")
+            ]
+
+        return signals  # type: ignore[return-value]
+
     def resolve(self, *names: str) -> "SignalSchema":
         schema = {}
         for field in names:
@@ -282,7 +299,18 @@ class SignalSchema:
         return SignalSchema(schema)
 
     def mutate(self, args_map: dict) -> "SignalSchema":
-        return SignalSchema(self.values | sql_to_python(args_map))
+        new_values = self.values.copy()
+
+        for name, value in args_map.items():
+            if isinstance(value, Column) and value.name in self.values:
+                # renaming existing signal
+                del new_values[value.name]
+                new_values[name] = self.values[value.name]
+            else:
+                # adding new signal
+                new_values.update(sql_to_python({name: value}))
+
+        return SignalSchema(new_values)
 
     def clone_without_sys_signals(self) -> "SignalSchema":
         schema = copy.deepcopy(self.values)
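`db_signals()` can now narrow the flattened column list to a single signal and return typed `Column` objects, which is what the new `mutate()` rename path builds on. A hedged sketch (the example schema and the double-underscore flattening are assumptions):

.. code:: python

    from datachain.lib.file import File
    from datachain.lib.signal_schema import SignalSchema

    schema = SignalSchema({"file": File, "score": float})

    print(schema.db_signals())              # flattened names, e.g. "file__path", "score"
    print(schema.db_signals(name="score"))  # only the columns belonging to "score"
    cols = schema.db_signals(name="file", as_columns=True)  # typed Column objects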
datachain/listing.py CHANGED
@@ -44,6 +44,16 @@ class Listing:
             self.dataset,
         )
 
+    def __enter__(self) -> "Listing":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+    def close(self) -> None:
+        self.metastore.close()
+        self.warehouse.close()
+
     @property
     def id(self):
         return self.storage.id
@@ -56,16 +66,18 @@ class Listing:
         sync(get_loop(), self._fetch, start_prefix, method)
 
     async def _fetch(self, start_prefix: str, method: str) -> None:
-        self = self.clone()
-        if start_prefix:
-            start_prefix = start_prefix.rstrip("/")
-        try:
-            async for entries in self.client.scandir(start_prefix, method=method):
-                self.insert_entries(entries)
-                if len(entries) > 1:
-                    self.metastore.update_last_inserted_at()
-        finally:
-            self.insert_entries_done()
+        with self.clone() as fetch_listing:
+            if start_prefix:
+                start_prefix = start_prefix.rstrip("/")
+            try:
+                async for entries in fetch_listing.client.scandir(
+                    start_prefix, method=method
+                ):
+                    fetch_listing.insert_entries(entries)
+                    if len(entries) > 1:
+                        fetch_listing.metastore.update_last_inserted_at()
+            finally:
+                fetch_listing.insert_entries_done()
 
     def insert_entry(self, entry: Entry) -> None:
         self.warehouse.insert_rows(
datachain/query/dataset.py CHANGED
@@ -1051,8 +1051,11 @@ class DatasetQuery:
         if anon:
             client_config["anon"] = True
 
+        self.session = Session.get(
+            session, catalog=catalog, client_config=client_config
+        )
+        self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
-        self.catalog = catalog or get_catalog(client_config=client_config)
         self._chunk_index: Optional[int] = None
         self._chunk_total: Optional[int] = None
         self.temp_table_names: list[str] = []
@@ -1063,7 +1066,6 @@
         self.version: Optional[int] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
-        self.session = Session.get(session, catalog=catalog)
 
         if path:
             kwargs = {"update": True} if update else {}
@@ -1200,12 +1202,10 @@
         # This is needed to always use a new connection with all metastore and warehouse
         # implementations, as errors may close or render unusable the existing
         # connections.
-        metastore = self.catalog.metastore.clone(use_new_connection=True)
-        metastore.cleanup_tables(self.temp_table_names)
-        metastore.close()
-        warehouse = self.catalog.warehouse.clone(use_new_connection=True)
-        warehouse.cleanup_tables(self.temp_table_names)
-        warehouse.close()
+        with self.catalog.metastore.clone(use_new_connection=True) as metastore:
+            metastore.cleanup_tables(self.temp_table_names)
+        with self.catalog.warehouse.clone(use_new_connection=True) as warehouse:
+            warehouse.cleanup_tables(self.temp_table_names)
         self.temp_table_names = []
 
     def db_results(self, row_factory=None, **kwargs):
@@ -1248,19 +1248,12 @@
         def row_iter() -> Generator[RowDict, None, None]:
             # warehouse isn't threadsafe, we need to clone() it
             # in the thread that uses the results
-            warehouse = None
-            try:
-                warehouse = self.catalog.warehouse.clone()
+            with self.catalog.warehouse.clone() as warehouse:
                 gen = warehouse.dataset_select_paginated(
                     query, limit=query._limit, order_by=query._order_by_clauses
                 )
                 with contextlib.closing(gen) as rows:
                     yield from rows
-            finally:
-                # clone doesn't necessarily create a new connection
-                # we can't do `warehouse.close()` for now. It is a bad design
-                # in clone / close interface that needs to be fixed.
-                pass
 
         async def get_params(row: RowDict) -> tuple:
             return tuple(
@@ -1383,10 +1376,14 @@
     @detach
     def limit(self, n: int) -> "Self":
         query = self.clone(new_table=False)
-        for step in query.steps:
-            if isinstance(step, SQLLimit) and step.n < n:
-                return query
-        query.steps.append(SQLLimit(n))
+        if (
+            query.steps
+            and (last_step := query.steps[-1])
+            and isinstance(last_step, SQLLimit)
+        ):
+            query.steps[-1] = SQLLimit(min(n, last_step.n))
+        else:
+            query.steps.append(SQLLimit(n))
         return query
 
     @detach
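Consecutive `limit()` calls now collapse into a single `SQLLimit` step that keeps the smaller value, rather than being special-cased or appended twice. A hedged sketch of the observable behaviour, assuming the usual `collect()` accessor:

.. code:: python

    from datachain.lib.dc import DataChain

    chain = DataChain.from_values(num=list(range(1000)))

    # Both orders end up with one SQLLimit(10) step.
    assert len(list(chain.limit(100).limit(10).collect())) == 10
    assert len(list(chain.limit(10).limit(100).collect())) == 10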
datachain/query/session.py CHANGED
@@ -41,7 +41,12 @@ class Session:
     SESSION_UUID_LEN = 6
     TEMP_TABLE_UUID_LEN = 6
 
-    def __init__(self, name="", catalog: Optional["Catalog"] = None):
+    def __init__(
+        self,
+        name="",
+        catalog: Optional["Catalog"] = None,
+        client_config: Optional[dict] = None,
+    ):
         if re.match(r"^[0-9a-zA-Z]+$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
@@ -52,13 +57,18 @@
 
         session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
         self.name = f"{name}_{session_uuid}"
-        self.catalog = catalog or get_catalog()
+        self.is_new_catalog = not catalog
+        self.catalog = catalog or get_catalog(client_config=client_config)
 
     def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self._cleanup_temp_datasets()
+        if self.is_new_catalog:
+            self.catalog.metastore.close_on_exit()
+            self.catalog.warehouse.close_on_exit()
+            self.catalog.id_generator.close_on_exit()
 
     def generate_temp_dataset_name(self) -> str:
         tmp_table_uid = uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
@@ -75,7 +85,10 @@
 
     @classmethod
     def get(
-        cls, session: Optional["Session"] = None, catalog: Optional["Catalog"] = None
+        cls,
+        session: Optional["Session"] = None,
+        catalog: Optional["Catalog"] = None,
+        client_config: Optional[dict] = None,
     ) -> "Session":
         """Creates a Session() object from a catalog.
 
@@ -88,7 +101,9 @@
             return session
 
         if cls.GLOBAL_SESSION is None:
-            cls.GLOBAL_SESSION_CTX = Session(cls.GLOBAL_SESSION_NAME, catalog)
+            cls.GLOBAL_SESSION_CTX = Session(
+                cls.GLOBAL_SESSION_NAME, catalog, client_config=client_config
+            )
             cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()
             atexit.register(cls._global_cleanup)
         return cls.GLOBAL_SESSION
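`Session` now owns catalog creation, forwards `client_config` to it, and closes the stores it created itself via `close_on_exit()` when the session ends. A hedged sketch of an explicit session (the config key is illustrative):

.. code:: python

    from datachain.lib.dc import DataChain
    from datachain.query.session import Session

    # Session names may contain only letters and digits.
    with Session("preview", client_config={"anon": True}) as session:
        chain = DataChain.from_values(num=[1, 2, 3], session=session)
        print(list(chain.collect()))
    # On exit, temp datasets are removed and, because this session created the
    # catalog itself, metastore, warehouse and id_generator are closed.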
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.16
+Version: 0.2.17
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -100,28 +100,87 @@ Requires-Dist: usearch ; extra == 'vector'
 AI 🔗 DataChain
 ----------------
 
-DataChain is an open-source Python library for processing and curating unstructured
-data at scale.
+DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
+AI engineers build a metadata layer on top of unstructured files and analyze data using
+this layer.
 
-🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
+📂 **Raw Files Processing**
+Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
+Local), version and update datasets.
 
-🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
+🌟 **Metadata layer.**
+Build a metadata layer on top of files using structured sources like CSV, Parquet,
+and JSON files.
 
-🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
+**Metadata enrichment.**
+Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
 
+🛠️ **Data Transformation.**
+Transform metadata using traditional methods like filtering, grouping, joining, and
+others.
 
-Datachain supports parallel processing, parallel data
-downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
-
-The typical use cases include Computer Vision data curation, LLM analytics,
-and validation of multimodal AI applications.
+🐍 **User-friendly interface.**
+Operate efficiently with familiar Python objects and object fields, eliminating the
+need for SQL.
 
 
 .. code:: console
 
    $ pip install datachain
 
-|Flowchart|
+
+Data Structures
+===============
+
+DataChain introduces expressive data structures tailored for AI-specific workload:
+
+- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
+  object serialization, dataset versioning and difference. Operations on dataset:
+
+  - **Transformations:** traditional data-frame or SQL operations such as filtering,
+    grouping, joining.
+  - **Enrichments:** mapping, aggregating and generating using customer’s Python
+    code. This is needed to work with ML inference and LLM calls.
+
+- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
+  mode - only when needed.
+
+DataChain name comes from these major data structures: dataset and chaining.
+
+
+What’s new in DataChain?
+========================
+
+The project combines multiple ideas from different areas in order to simplify AI
+use-cases and at the same time to fit it into traditional data infrastructure.
+
+- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
+  native language for AI. It’s powered by `Pydantic`_ data models.
+- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
+  group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
+  needed for distributed computations.
+- **Resuming data processing** (in development). Introduces idempotent operations,
+  allowing data processing to resume from the last successful process file/record/batch
+  if it fails due to issues like failed LLM calls, ML inference or file download.
+
+Additional relatively new ideas:
+
+- **Functional style data processing.** Using a functional/chaining approach to data
+  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
+- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
+  and implements data versioning, extending ideas from DVC (developed by the same team).
+
+
+What DataChain is NOT?
+======================
+
+- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
+  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
+  version.
+- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
+  it delegates heavy data transformations to underlying data warehouses and focuses on
+  AI specific data enrichments and orchestrating all the pieces together.
+
 
 Quick Start
 -----------
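The description above centers on datasets plus lazy chaining; a minimal hedged sketch of what such a chain looks like with this release (signal names and values are illustrative):

.. code:: python

    from datachain.lib.dc import DataChain
    from datachain.query.schema import Column

    chain = (
        DataChain.from_values(fib=[1, 1, 2, 3, 5, 8])
        .mutate(doubled=Column("fib") * 2)  # transformation, recorded lazily
        .limit(3)
    )
    print(list(chain.collect()))  # execution happens only here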
@@ -8,7 +8,7 @@ datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
 datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
-datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
+datachain/listing.py,sha256=JEhi5WOSV2LUqRQgt0-fdmJ8Zb5fNpNFzBQcuTtx63o,8555
 datachain/node.py,sha256=LwzSOSM9SbPLI5RvYDsiEkk7d5rbMX8huzM_m7uWKx4,5917
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=LZo9pIgi_HOUWpxX1c7RMt5OnrlDHXx2YpL5oP8X0kk,80397
+datachain/catalog/catalog.py,sha256=z0tclel0kNdSzJojNRRnRVhgt-K7ElO3CeuurlwQMGI,80612
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -29,27 +29,27 @@ datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
 datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
 datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
-datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
-datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
+datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
+datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=ody-hWyrisGuNlzy24bc7QBqPXWIg64NcucIhZYronk,54842
+datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
 datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=w0d_cZ2u9LpQYFFXll22mnxHaxPOoJdHlsKAZmONQpA,25605
-datachain/data_storage/warehouse.py,sha256=3iD946WXgGxohZ5lagmwydFZr7j7RceZW423QXU_7_U,33120
+datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
+datachain/data_storage/warehouse.py,sha256=G79jsQwA6anYPWoiBXngwPyx-uP7yGIWqhZGc4TL5mY,33591
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=9C5AVH6tLo9hwzav-1tLLnmWP-3_SReYCOfcOC54pu0,4437
+datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=6RtwA7MC3hosxi9RBgpOXjkv46SdN99g9N_u4mCDUUo,56071
-datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
+datachain/lib/dc.py,sha256=bZx7VJ389SJ5gRTkckFD044LHq_hOgHqvhTD7gJoBZY,56963
+datachain/lib/file.py,sha256=MCklths3w9SgQTR0LACnDohfGdEc3t30XD0qNq1oTlI,12000
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=XQTINSN_FJK76Jn8qd03g6J0cum58knP8U7Iuw-zKyU,14704
+datachain/lib/signal_schema.py,sha256=VL9TR0CJ3eRzjIDr-8e-e7cZKuMBbPUZtY2lGAsucc0,15734
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
 datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -66,12 +66,12 @@ datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffO
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=iTz3c5nJ-WmoQ5zcvKGT9ly6xVKJtD_fk76LA7zecWk,60164
+datachain/query/dataset.py,sha256=-AGkz3-K_b-2YBJCMqQz-Qq7FKzMcScPty_77S0AQtE,59938
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/schema.py,sha256=hAvux_GxUmuG_PwtnKkkizld9f0Gvt2JBzbu3m74fvE,7840
-datachain/query/session.py,sha256=am4XCNj8NlZPAYJSvh43C13dQ5NsfzzuyVDjPgYAgJE,3655
+datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
 datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.16.dist-info/METADATA,sha256=1f326fK-ZnS0nPvETuUj9PaI4R5SatpGVDIsQiJ0OvM,14577
-datachain-0.2.16.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-datachain-0.2.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.16.dist-info/RECORD,,
+datachain-0.2.17.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.17.dist-info/METADATA,sha256=STR0-4R9NOW55GgadrPA_-fmx5-WckcwhTmyH_OgaUs,17269
+datachain-0.2.17.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.2.17.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.17.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.17.dist-info/RECORD,,