datachain 0.36.0__py3-none-any.whl → 0.36.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

datachain/data_storage/schema.py CHANGED

@@ -11,7 +11,6 @@ from datachain.sql.types import (
     JSON,
     Boolean,
     DateTime,
-    Int,
     Int64,
     SQLType,
     String,

@@ -269,7 +268,7 @@ class DataTable:
     @classmethod
     def sys_columns(cls):
         return [
-            sa.Column("sys__id", Int, primary_key=True),
+            sa.Column("sys__id", UInt64, primary_key=True),
            sa.Column(
                "sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
            ),
datachain/data_storage/sqlite.py CHANGED

@@ -868,11 +868,8 @@ class SQLiteWarehouse(AbstractWarehouse):
        if isinstance(c, BinaryExpression):
            right_left_join = add_left_rows_filter(c)

-        # Use CTE instead of subquery to force SQLite to materialize the result
-        # This breaks deep nesting and prevents parser stack overflow.
        union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
-
-        return self._regenerate_system_columns(union_cte)
+        return sqlalchemy.select(*union_cte.c).select_from(union_cte)

    def _system_row_number_expr(self):
        return func.row_number().over()
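The outer-join path now returns a plain SELECT over the union CTE instead of regenerating system columns here. A minimal sketch of the pattern, using an illustrative table that is not part of datachain; per the comment removed above, the CTE forces SQLite to materialize the union once, which avoids the deeply nested subqueries that can overflow SQLite's parser stack:

    import sqlalchemy as sa

    meta = sa.MetaData()
    t = sa.Table("t", meta, sa.Column("id", sa.Integer), sa.Column("v", sa.String))

    left = sa.select(t.c.id, t.c.v).where(t.c.id < 10)
    right = sa.select(t.c.id, t.c.v).where(t.c.id >= 10)

    # Materialize the union in a CTE, then select its columns back out.
    union_cte = sa.union(left, right).cte()
    query = sa.select(*union_cte.c).select_from(union_cte)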
@@ -884,11 +881,7 @@ class SQLiteWarehouse(AbstractWarehouse):
        """
        Create a temporary table from a query for use in a UDF.
        """
-        columns = [
-            sqlalchemy.Column(c.name, c.type)
-            for c in query.selected_columns
-            if c.name != "sys__id"
-        ]
+        columns = [sqlalchemy.Column(c.name, c.type) for c in query.selected_columns]
        table = self.create_udf_table(columns)

        with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar:
datachain/data_storage/warehouse.py CHANGED

@@ -5,7 +5,7 @@ import random
 import string
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
-from typing import TYPE_CHECKING, Any, Union
+from typing import TYPE_CHECKING, Any, Union, cast
 from urllib.parse import urlparse

 import attrs

@@ -23,7 +23,7 @@ from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.query.batch import RowsOutput
 from datachain.query.schema import ColumnMeta
 from datachain.sql.functions import path as pathfunc
-from datachain.sql.types import Int, SQLType
+from datachain.sql.types import SQLType
 from datachain.utils import sql_escape_like

 if TYPE_CHECKING:

@@ -32,6 +32,7 @@ if TYPE_CHECKING:
        _FromClauseArgument,
        _OnClauseArgument,
    )
+    from sqlalchemy.sql.selectable import FromClause
    from sqlalchemy.types import TypeEngine

    from datachain.data_storage import schema

@@ -248,45 +249,56 @@ class AbstractWarehouse(ABC, Serializable):

    def _regenerate_system_columns(
        self,
-        selectable: sa.Select | sa.CTE,
+        selectable: sa.Select,
        keep_existing_columns: bool = False,
+        regenerate_columns: Iterable[str] | None = None,
    ) -> sa.Select:
        """
-        Return a SELECT that regenerates sys__id and sys__rand deterministically.
+        Return a SELECT that regenerates system columns deterministically.

-        If keep_existing_columns is True, existing sys__id and sys__rand columns
-        will be kept as-is if they exist in the input selectable.
-        """
-        base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
-
-        result_columns: dict[str, sa.ColumnElement] = {}
-        for col in base.c:
-            if col.name in result_columns:
-                raise ValueError(f"Duplicate column name {col.name} in SELECT")
-            if col.name in ("sys__id", "sys__rand"):
-                if keep_existing_columns:
-                    result_columns[col.name] = col
-            else:
-                result_columns[col.name] = col
+        If keep_existing_columns is True, existing system columns will be kept as-is
+        even when they are listed in ``regenerate_columns``.

-        system_types: dict[str, sa.types.TypeEngine] = {
+        Args:
+            selectable: Base SELECT
+            keep_existing_columns: When True, reuse existing system columns even if
+                they are part of the regeneration set.
+            regenerate_columns: Names of system columns to regenerate. Defaults to
+                {"sys__id", "sys__rand"}. Columns not listed are left untouched.
+        """
+        system_columns = {
            sys_col.name: sys_col.type
            for sys_col in self.schema.dataset_row_cls.sys_columns()
        }
+        regenerate = set(regenerate_columns or system_columns)
+        generators = {
+            "sys__id": self._system_row_number_expr,
+            "sys__rand": self._system_random_expr,
+        }
+
+        base = cast("FromClause", selectable.subquery())
+
+        def build(name: str) -> sa.ColumnElement:
+            expr = generators[name]()
+            return sa.cast(expr, system_columns[name]).label(name)
+
+        columns: list[sa.ColumnElement] = []
+        present: set[str] = set()
+        changed = False
+
+        for col in base.c:
+            present.add(col.name)
+            regen = col.name in regenerate and not keep_existing_columns
+            columns.append(build(col.name) if regen else col)
+            changed |= regen
+
+        for name in regenerate - present:
+            columns.append(build(name))
+            changed = True
+
+        if not changed:
+            return selectable

-        # Add missing system columns if needed
-        if "sys__id" not in result_columns:
-            expr = self._system_row_number_expr()
-            expr = sa.cast(expr, system_types["sys__id"])
-            result_columns["sys__id"] = expr.label("sys__id")
-        if "sys__rand" not in result_columns:
-            expr = self._system_random_expr()
-            expr = sa.cast(expr, system_types["sys__rand"])
-            result_columns["sys__rand"] = expr.label("sys__rand")
-
-        # Wrap in subquery to materialize window functions, then wrap again in SELECT
-        # This ensures window functions are computed before INSERT...FROM SELECT
-        columns = list(result_columns.values())
        inner = sa.select(*columns).select_from(base).subquery()
        return sa.select(*inner.c).select_from(inner)
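The final double wrap survives from the old implementation: as the removed comment explained, window functions such as row_number() must be computed in an inner subquery so they are materialized before the result feeds an INSERT ... FROM SELECT. A standalone sketch of that pattern with illustrative names:

    import sqlalchemy as sa

    base = sa.select(sa.literal("a").label("val")).subquery()

    # Compute the window function in an inner subquery...
    sys_id = sa.cast(sa.func.row_number().over(), sa.BigInteger).label("sys__id")
    inner = sa.select(sys_id, *base.c).select_from(base).subquery()

    # ...then select its columns back out, so the row numbers are fixed
    # before any INSERT ... FROM SELECT consumes them.
    stmt = sa.select(*inner.c).select_from(inner)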
@@ -950,10 +962,15 @@ class AbstractWarehouse(ABC, Serializable):
        SQLite TEMPORARY tables cannot be directly used as they are process-specific,
        and UDFs are run in other processes when run in parallel.
        """
+        columns = [
+            c
+            for c in columns
+            if c.name not in [col.name for col in self.dataset_row_cls.sys_columns()]
+        ]
        tbl = sa.Table(
            name or self.udf_table_name(),
            sa.MetaData(),
-            sa.Column("sys__id", Int, primary_key=True),
+            *self.dataset_row_cls.sys_columns(),
            *columns,
        )
        self.db.create_table(tbl, if_not_exists=True)
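Every UDF table is now built with the full set of canonical system columns, and any system columns the caller passed in are filtered out first so they are never duplicated. A sketch of the resulting layout, with illustrative column names and stand-in types (the real types come from datachain.sql.types, e.g. UInt64):

    import sqlalchemy as sa

    # Stand-ins for dataset_row_cls.sys_columns()
    sys_columns = [
        sa.Column("sys__id", sa.BigInteger, primary_key=True),
        sa.Column("sys__rand", sa.BigInteger, nullable=False),
    ]
    udf_columns = [sa.Column("score", sa.Float)]  # hypothetical UDF output

    tbl = sa.Table("udf_example", sa.MetaData(), *sys_columns, *udf_columns)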
datachain/diff/__init__.py CHANGED

@@ -24,7 +24,7 @@ class CompareStatus(str, Enum):
     SAME = "S"


-def _compare(  # noqa: C901, PLR0912
+def _compare(  # noqa: C901
     left: "DataChain",
     right: "DataChain",
     on: str | Sequence[str],

@@ -151,11 +151,7 @@ def _compare(  # noqa: C901, PLR0912
    if status_col:
        cols_select.append(diff_col)

-    if not dc_diff._sys:
-        # TODO workaround when sys signal is not available in diff
-        dc_diff = dc_diff.settings(sys=True).select(*cols_select).settings(sys=False)
-    else:
-        dc_diff = dc_diff.select(*cols_select)
+    dc_diff = dc_diff.select(*cols_select)

    # final schema is schema from the left chain with status column added if needed
    dc_diff.signals_schema = (
datachain/lib/audio.py CHANGED
@@ -1,4 +1,5 @@
 import posixpath
+import re
 from typing import TYPE_CHECKING

 from datachain.lib.file import FileError

@@ -9,7 +10,7 @@ if TYPE_CHECKING:
    from datachain.lib.file import Audio, AudioFile, File

 try:
-    import torchaudio
+    import soundfile as sf
 except ImportError as exc:
    raise ImportError(
        "Missing dependencies for processing audio.\n"
@@ -26,18 +27,25 @@ def audio_info(file: "File | AudioFile") -> "Audio":

    try:
        with file.open() as f:
-            info = torchaudio.info(f)
+            info = sf.info(f)
+
+            sample_rate = int(info.samplerate)
+            channels = int(info.channels)
+            frames = int(info.frames)
+            duration = float(info.duration)

-        sample_rate = int(info.sample_rate)
-        channels = int(info.num_channels)
-        frames = int(info.num_frames)
-        duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
+        # soundfile provides format and subtype
+        if info.format:
+            format_name = info.format.lower()
+        else:
+            format_name = file.get_file_ext().lower()

-        codec_name = getattr(info, "encoding", "")
-        file_ext = file.get_file_ext().lower()
-        format_name = _encoding_to_format(codec_name, file_ext)
+        if not format_name:
+            format_name = "unknown"
+        codec_name = info.subtype if info.subtype else ""

-        bits_per_sample = getattr(info, "bits_per_sample", 0)
+        # Calculate bit rate from subtype
+        bits_per_sample = _get_bits_per_sample(info.subtype)
        bit_rate = (
            bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
        )
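The metadata used above comes straight from soundfile's info object. A minimal sketch, where "example.wav" is a placeholder path:

    import soundfile as sf

    info = sf.info("example.wav")
    print(info.samplerate, info.channels, info.frames, info.duration)
    print(info.format, info.subtype)  # e.g. "WAV", "PCM_16"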
@@ -58,44 +66,39 @@ def audio_info(file: "File | AudioFile") -> "Audio":
    )


-def _encoding_to_format(encoding: str, file_ext: str) -> str:
+def _get_bits_per_sample(subtype: str) -> int:
    """
-    Map torchaudio encoding to a format name.
+    Map soundfile subtype to bits per sample.

    Args:
-        encoding: The encoding string from torchaudio.info()
-        file_ext: The file extension as a fallback
+        subtype: The subtype string from soundfile

    Returns:
-        Format name as a string
+        Bits per sample, or 0 if unknown
    """
-    # Direct mapping for formats that match exactly
-    encoding_map = {
-        "FLAC": "flac",
-        "MP3": "mp3",
-        "VORBIS": "ogg",
-        "AMR_WB": "amr",
-        "AMR_NB": "amr",
-        "OPUS": "opus",
-        "GSM": "gsm",
+    if not subtype:
+        return 0
+
+    # Common PCM and floating-point subtypes
+    pcm_bits = {
+        "PCM_16": 16,
+        "PCM_24": 24,
+        "PCM_32": 32,
+        "PCM_S8": 8,
+        "PCM_U8": 8,
+        "FLOAT": 32,
+        "DOUBLE": 64,
    }

-    if encoding in encoding_map:
-        return encoding_map[encoding]
+    if subtype in pcm_bits:
+        return pcm_bits[subtype]

-    # For PCM variants, use file extension to determine format
-    if encoding.startswith("PCM_"):
-        # Common PCM formats by extension
-        pcm_formats = {
-            "wav": "wav",
-            "aiff": "aiff",
-            "au": "au",
-            "raw": "raw",
-        }
-        return pcm_formats.get(file_ext, "wav")  # Default to wav for PCM
+    # Handle variants such as PCM_S16LE, PCM_F32LE, etc.
+    match = re.search(r"PCM_(?:[A-Z]*?)(\d+)", subtype)
+    if match:
+        return int(match.group(1))

-    # Fallback to file extension if encoding is unknown
-    return file_ext if file_ext else "unknown"
+    return 0


 def audio_to_np(
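A quick check of the fallback regex against typical subtype spellings (in the real function, named subtypes hit the lookup table first; the regex is applied directly here for illustration):

    import re

    for subtype in ("PCM_24", "PCM_S16LE", "PCM_F32LE"):
        m = re.search(r"PCM_(?:[A-Z]*?)(\d+)", subtype)
        print(subtype, int(m.group(1)) if m else 0)
    # PCM_24 -> 24, PCM_S16LE -> 16, PCM_F32LE -> 32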
@@ -114,27 +117,27 @@ def audio_to_np(

    try:
        with audio.open() as f:
-            info = torchaudio.info(f)
-            sample_rate = info.sample_rate
+            info = sf.info(f)
+            sample_rate = info.samplerate

            frame_offset = int(start * sample_rate)
            num_frames = int(duration * sample_rate) if duration is not None else -1

            # Reset file pointer to the beginning
-            # This is important to ensure we read from the correct position later
            f.seek(0)

-            waveform, sr = torchaudio.load(
-                f, frame_offset=frame_offset, num_frames=num_frames
+            # Read audio data with offset and frame count
+            audio_np, sr = sf.read(
+                f,
+                start=frame_offset,
+                frames=num_frames,
+                always_2d=False,
+                dtype="float32",
            )

-            audio_np = waveform.numpy()
-
-            if audio_np.shape[0] > 1:
-                audio_np = audio_np.T
-            else:
-                audio_np = audio_np.squeeze()
-
+            # soundfile returns shape (frames,) for mono or
+            # (frames, channels) for multi-channel
+            # We keep this format as it matches expected output
            return audio_np, int(sr)
    except Exception as exc:
        raise FileError(
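sf.read accepts a start frame and a frame count directly, which is what replaces torchaudio's frame_offset/num_frames here. A hedged sketch against a placeholder file:

    import soundfile as sf

    with open("example.wav", "rb") as f:  # placeholder path
        sr = sf.info(f).samplerate
        f.seek(0)  # sf.info advances the file position
        data, sr = sf.read(
            f,
            start=int(1.0 * sr),    # skip the first second
            frames=int(2.0 * sr),   # read two seconds; -1 means "to the end"
            dtype="float32",
            always_2d=False,
        )
    # data.shape == (frames,) for mono, (frames, channels) otherwise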
@@ -152,11 +155,9 @@ def audio_to_bytes(

    If duration is None, converts from start to end of file.
    If start is 0 and duration is None, converts entire file."""
-    y, sr = audio_to_np(audio, start, duration)
-
    import io

-    import soundfile as sf
+    y, sr = audio_to_np(audio, start, duration)

    buffer = io.BytesIO()
    sf.write(buffer, y, sr, format=format)
datachain/lib/dc/datachain.py CHANGED

@@ -856,7 +856,9 @@ class DataChain:
                udf_obj.to_udf_wrapper(self._settings.batch_size),
                **self._settings.to_dict(),
            ),
-            signal_schema=self.signals_schema | udf_obj.output,
+            signal_schema=SignalSchema({"sys": Sys})
+            | self.signals_schema
+            | udf_obj.output,
        )

    def gen(

@@ -894,7 +896,7 @@ class DataChain:
                udf_obj.to_udf_wrapper(self._settings.batch_size),
                **self._settings.to_dict(),
            ),
-            signal_schema=udf_obj.output,
+            signal_schema=SignalSchema({"sys": Sys}) | udf_obj.output,
        )

    @delta_disabled

@@ -1031,7 +1033,7 @@ class DataChain:
                partition_by=processed_partition_by,
                **self._settings.to_dict(),
            ),
-            signal_schema=udf_obj.output,
+            signal_schema=SignalSchema({"sys": Sys}) | udf_obj.output,
        )

    def batch_map(

@@ -1097,11 +1099,7 @@ class DataChain:
        sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
        DataModel.register(list(sign.output_schema.values.values()))

-        signals_schema = self.signals_schema
-        if self._sys:
-            signals_schema = SignalSchema({"sys": Sys}) | signals_schema
-
-        params_schema = signals_schema.slice(
+        params_schema = self.signals_schema.slice(
            sign.params, self._setup, is_batch=is_batch
        )

@@ -1156,11 +1154,9 @@ class DataChain:
            )
        )

-    def select(self, *args: str, _sys: bool = True) -> "Self":
+    def select(self, *args: str) -> "Self":
        """Select only a specified set of signals."""
        new_schema = self.signals_schema.resolve(*args)
-        if self._sys and _sys:
-            new_schema = SignalSchema({"sys": Sys}) | new_schema
        columns = new_schema.db_signals()
        return self._evolve(
            query=self._query.select(*columns), signal_schema=new_schema

@@ -1710,9 +1706,11 @@ class DataChain:

        signals_schema = self.signals_schema.clone_without_sys_signals()
        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
-        ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
-            right_signals_schema, rname
-        )
+
+        ds.signals_schema = signals_schema.merge(right_signals_schema, rname)
+
+        if not full:
+            ds.signals_schema = SignalSchema({"sys": Sys}) | ds.signals_schema

        return ds

@@ -1723,6 +1721,7 @@ class DataChain:
        Parameters:
            other: chain whose rows will be added to `self`.
        """
+        self.signals_schema = self.signals_schema.clone_without_sys_signals()
        return self._evolve(query=self._query.union(other._query))

    def subtract(  # type: ignore[override]
datachain/query/dataset.py CHANGED

@@ -438,9 +438,6 @@ class UDFStep(Step, ABC):
    """

    def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
-        if "sys__id" not in query.selected_columns:
-            raise RuntimeError("Query must have sys__id column to run UDF")
-
        if (rows_total := self.catalog.warehouse.query_count(query)) == 0:
            return

@@ -634,12 +631,11 @@ class UDFStep(Step, ABC):

        # Apply partitioning if needed.
        if self.partition_by is not None:
-            if "sys__id" not in query.selected_columns:
-                _query = query = self.catalog.warehouse._regenerate_system_columns(
-                    query,
-                    keep_existing_columns=True,
-                )
-
+            _query = query = self.catalog.warehouse._regenerate_system_columns(
+                query_generator.select(),
+                keep_existing_columns=True,
+                regenerate_columns=["sys__id"],
+            )
            partition_tbl = self.create_partitions_table(query)
            temp_tables.append(partition_tbl.name)
            query = query.outerjoin(

@@ -960,28 +956,23 @@ class SQLUnion(Step):
        q2 = self.query2.apply_steps().select().subquery()
        temp_tables.extend(self.query2.temp_table_names)

-        columns1, columns2 = _order_columns(q1.columns, q2.columns)
-
-        union_select = sqlalchemy.select(*columns1).union_all(
-            sqlalchemy.select(*columns2)
-        )
-        union_cte = union_select.cte()
-        regenerated = self.query1.catalog.warehouse._regenerate_system_columns(
-            union_cte
-        )
-        result_columns = tuple(regenerated.selected_columns)
+        columns1 = _drop_system_columns(q1.columns)
+        columns2 = _drop_system_columns(q2.columns)
+        columns1, columns2 = _order_columns(columns1, columns2)

        def q(*columns):
-            if not columns:
-                return regenerated
+            selected_names = [c.name for c in columns]
+            col1 = [c for c in columns1 if c.name in selected_names]
+            col2 = [c for c in columns2 if c.name in selected_names]
+            union_query = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))

-            names = {c.name for c in columns}
-            selected = [c for c in result_columns if c.name in names]
-            return regenerated.with_only_columns(*selected)
+            union_cte = union_query.cte()
+            select_cols = [union_cte.c[name] for name in selected_names]
+            return sqlalchemy.select(*select_cols)

        return step_result(
            q,
-            result_columns,
+            columns1,
            dependencies=self.query1.dependencies | self.query2.dependencies,
        )
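The union now strips each branch's sys__* columns up front and builds the UNION ALL lazily inside q(); fresh system columns are presumably regenerated downstream by the _regenerate_system_columns machinery when the result is materialized. A compile-only sketch of the column handling, with illustrative tables:

    import sqlalchemy as sa

    meta = sa.MetaData()
    t1 = sa.Table("t1", meta, sa.Column("sys__id", sa.Integer), sa.Column("v", sa.String))
    t2 = sa.Table("t2", meta, sa.Column("sys__id", sa.Integer), sa.Column("v", sa.String))

    def drop_system_columns(columns):
        return [c for c in columns if not c.name.startswith("sys__")]

    c1, c2 = drop_system_columns(t1.c), drop_system_columns(t2.c)
    union_cte = sa.select(*c1).union_all(sa.select(*c2)).cte()
    stmt = sa.select(*[union_cte.c[c.name] for c in c1])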
@@ -1070,7 +1061,7 @@ class SQLJoin(Step):
        q1 = self.get_query(self.query1, temp_tables)
        q2 = self.get_query(self.query2, temp_tables)

-        q1_columns = list(q1.c)
+        q1_columns = _drop_system_columns(q1.c) if self.full else list(q1.c)
        q1_column_names = {c.name for c in q1_columns}

        q2_columns = []

@@ -1211,6 +1202,10 @@ def _order_columns(
    return [[d[n] for n in column_order] for d in column_dicts]


+def _drop_system_columns(columns: Iterable[ColumnElement]) -> list[ColumnElement]:
+    return [c for c in columns if not c.name.startswith("sys__")]
+
+
 @attrs.define
 class ResultIter:
    _row_iter: Iterable[Any]
datachain/query/dispatch.py CHANGED

@@ -2,12 +2,16 @@ import contextlib
 from collections.abc import Iterable, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
+from queue import Empty
 from sys import stdin
+from time import monotonic, sleep
 from typing import TYPE_CHECKING, Literal

+import multiprocess
 from cloudpickle import load, loads
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from multiprocess import get_context
+from multiprocess.context import Process
+from multiprocess.queues import Queue as MultiprocessQueue

 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache

@@ -25,7 +29,6 @@ from datachain.query.udf import UdfInfo
 from datachain.utils import batched, flatten, safe_closing

 if TYPE_CHECKING:
-    import multiprocess
    from sqlalchemy import Select, Table

    from datachain.data_storage import AbstractMetastore, AbstractWarehouse

@@ -101,8 +104,8 @@ def udf_worker_entrypoint(fd: int | None = None) -> int:

 class UDFDispatcher:
    _catalog: Catalog | None = None
-    task_queue: "multiprocess.Queue | None" = None
-    done_queue: "multiprocess.Queue | None" = None
+    task_queue: MultiprocessQueue | None = None
+    done_queue: MultiprocessQueue | None = None

    def __init__(self, udf_info: UdfInfo, buffer_size: int = DEFAULT_BATCH_SIZE):
        self.udf_data = udf_info["udf_data"]

@@ -121,7 +124,7 @@ class UDFDispatcher:
        self.buffer_size = buffer_size
        self.task_queue = None
        self.done_queue = None
-        self.ctx = get_context("spawn")
+        self.ctx = multiprocess.get_context("spawn")

    @property
    def catalog(self) -> "Catalog":
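The module-level import of multiprocess replaces the bare get_context import; with the "spawn" start method, queues and processes should come from the same context object. A minimal sketch:

    import multiprocess

    ctx = multiprocess.get_context("spawn")
    queue = ctx.Queue()  # same context as the workers that will use it
    proc = ctx.Process(target=print, args=("hello",))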
@@ -259,8 +262,6 @@ class UDFDispatcher:
        for p in pool:
            p.start()

-        # Will be set to True if all tasks complete normally
-        normal_completion = False
        try:
            # Will be set to True when the input is exhausted
            input_finished = False

@@ -283,10 +284,20 @@ class UDFDispatcher:

            # Process all tasks
            while n_workers > 0:
-                try:
-                    result = get_from_queue(self.done_queue)
-                except KeyboardInterrupt:
-                    break
+                while True:
+                    try:
+                        result = self.done_queue.get_nowait()
+                        break
+                    except Empty:
+                        for p in pool:
+                            exitcode = p.exitcode
+                            if exitcode not in (None, 0):
+                                message = (
+                                    f"Worker {p.name} exited unexpectedly with "
+                                    f"code {exitcode}"
+                                )
+                                raise RuntimeError(message) from None
+                        sleep(0.01)

                if bytes_downloaded := result.get("bytes_downloaded"):
                    download_cb.relative_update(bytes_downloaded)
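A blocking queue read can wait forever if a worker dies without posting a result; the new loop polls with get_nowait and fails fast on a non-zero worker exit code. A runnable toy illustrating the failure mode, using stdlib multiprocessing rather than datachain's dispatcher:

    import sys
    from multiprocessing import Process, Queue
    from queue import Empty
    from time import sleep

    def crash() -> None:
        sys.exit(3)  # dies without producing a result

    if __name__ == "__main__":
        done_queue = Queue()
        worker = Process(target=crash)
        worker.start()
        while True:
            try:
                result = done_queue.get_nowait()
                break
            except Empty:
                if worker.exitcode not in (None, 0):
                    raise RuntimeError(f"worker exited with code {worker.exitcode}")
                sleep(0.01)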
@@ -313,39 +324,50 @@ class UDFDispatcher:
                    put_into_queue(self.task_queue, next(input_data))
                except StopIteration:
                    input_finished = True
-
-            # Finished with all tasks normally
-            normal_completion = True
        finally:
-            if not normal_completion:
-                # Stop all workers if there is an unexpected exception
-                for _ in pool:
-                    put_into_queue(self.task_queue, STOP_SIGNAL)
-
-                # This allows workers (and this process) to exit without
-                # consuming any remaining data in the queues.
-                # (If they exit due to an exception.)
-                self.task_queue.close()
-                self.task_queue.join_thread()
-
-                # Flush all items from the done queue.
-                # This is needed if any workers are still running.
-                while n_workers > 0:
-                    result = get_from_queue(self.done_queue)
-                    status = result["status"]
-                    if status != OK_STATUS:
-                        n_workers -= 1
-
-                self.done_queue.close()
-                self.done_queue.join_thread()
+            self._shutdown_workers(pool)
+
+    def _shutdown_workers(self, pool: list[Process]) -> None:
+        self._terminate_pool(pool)
+        self._drain_queue(self.done_queue)
+        self._drain_queue(self.task_queue)
+        self._close_queue(self.done_queue)
+        self._close_queue(self.task_queue)
+
+    def _terminate_pool(self, pool: list[Process]) -> None:
+        for proc in pool:
+            if proc.is_alive():
+                proc.terminate()
+
+        deadline = monotonic() + 1.0
+        for proc in pool:
+            if not proc.is_alive():
+                continue
+            remaining = deadline - monotonic()
+            if remaining > 0:
+                proc.join(remaining)
+            if proc.is_alive():
+                proc.kill()
+                proc.join(timeout=0.2)
+
+    def _drain_queue(self, queue: MultiprocessQueue) -> None:
+        while True:
+            try:
+                queue.get_nowait()
+            except Empty:
+                return
+            except (OSError, ValueError):
+                return

-            # Wait for workers to stop
-            for p in pool:
-                p.join()
+    def _close_queue(self, queue: MultiprocessQueue) -> None:
+        with contextlib.suppress(OSError, ValueError):
+            queue.close()
+        with contextlib.suppress(RuntimeError, AssertionError, ValueError):
+            queue.join_thread()


 class DownloadCallback(Callback):
-    def __init__(self, queue: "multiprocess.Queue") -> None:
+    def __init__(self, queue: MultiprocessQueue) -> None:
        self.queue = queue
        super().__init__()
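Draining before closing matters because a multiprocessing-style queue flushes buffered items to its pipe from a background feeder thread, and join_thread() can hang if a full pipe has no reader. A sketch of the drain-then-close pattern under that assumption:

    import contextlib
    from queue import Empty

    def drain_and_close(q) -> None:
        # Empty the queue so the feeder thread can flush and exit...
        with contextlib.suppress(Empty, OSError, ValueError):
            while True:
                q.get_nowait()
        # ...then close and join it, ignoring "already closed" style errors.
        with contextlib.suppress(OSError, ValueError):
            q.close()
        with contextlib.suppress(RuntimeError, AssertionError, ValueError):
            q.join_thread()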
@@ -360,7 +382,7 @@ class ProcessedCallback(Callback):
    def __init__(
        self,
        name: Literal["processed", "generated"],
-        queue: "multiprocess.Queue",
+        queue: MultiprocessQueue,
    ) -> None:
        self.name = name
        self.queue = queue

@@ -375,8 +397,8 @@ class UDFWorker:
        self,
        catalog: "Catalog",
        udf: "UDFAdapter",
-        task_queue: "multiprocess.Queue",
-        done_queue: "multiprocess.Queue",
+        task_queue: MultiprocessQueue,
+        done_queue: MultiprocessQueue,
        query: "Select",
        table: "Table",
        cache: bool,
datachain/query/queue.py CHANGED
@@ -1,11 +1,12 @@
 import datetime
 from collections.abc import Iterable, Iterator
-from queue import Empty, Full, Queue
+from queue import Empty, Full
 from struct import pack, unpack
 from time import sleep
 from typing import Any

 import msgpack
+from multiprocess.queues import Queue

 from datachain.query.batch import RowsOutput
datachain-0.36.0.dist-info/METADATA → datachain-0.36.1.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.36.0
+Version: 0.36.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -64,7 +64,6 @@ Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
 Provides-Extra: audio
-Requires-Dist: torchaudio; extra == "audio"
 Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"

@@ -76,6 +75,7 @@ Requires-Dist: numba>=0.60.0; extra == "hf"
 Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
 Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
 Requires-Dist: fsspec>=2024.12.0; extra == "hf"
+Requires-Dist: torch<2.9.0; extra == "hf"
 Provides-Extra: video
 Requires-Dist: ffmpeg-python; extra == "video"
 Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"

@@ -117,6 +117,7 @@ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
 Requires-Dist: openai; extra == "examples"
+Requires-Dist: torchaudio<2.9.0; extra == "examples"
 Dynamic: license-file

 ================
datachain-0.36.0.dist-info/RECORD → datachain-0.36.1.dist-info/RECORD

@@ -55,11 +55,11 @@ datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWT
 datachain/data_storage/db_engine.py,sha256=MGbrckXk5kHOfpjnhHhGpyJpAsgaBCxMmfd33hB2SWI,3756
 datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
 datachain/data_storage/metastore.py,sha256=NLGYLErWFUNXjKbEoESFkKW222MQdMCBlpuqaYVugsE,63484
-datachain/data_storage/schema.py,sha256=4FZZFgPTI9e3gUFdlm1smPdES7FHctwXQNdNfY69tj8,9807
+datachain/data_storage/schema.py,sha256=3fAgiE11TIDYCW7EbTdiOm61SErRitvsLr7YPnUlVm0,9801
 datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
-datachain/data_storage/sqlite.py,sha256=xQZ944neP57K_25HSetIy35IakAcyA0cUKVe-xeIEgQ,31168
-datachain/data_storage/warehouse.py,sha256=rNz2wFlFA-pyBAuy14RL6lRIFhrNEnX02c9SgGs4v58,34994
-datachain/diff/__init__.py,sha256=pixXOnbOcoxfkBvbaiDNGPhJMEyTiHb9EIFxR7QqY5A,9533
+datachain/data_storage/sqlite.py,sha256=MgQ6bfJ7LGW91UiVHQtSkj_5HalRi1aeHCEW__5JEe8,30959
+datachain/data_storage/warehouse.py,sha256=nuGT27visvAi7jr7ZAZF-wmFe0ZEFD8qaTheINX_7RM,35269
+datachain/diff/__init__.py,sha256=Fo3xMnctKyA0YtvnsBXQ-P5gQeeEwed17Tn_i7vfLKs,9332
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825

@@ -76,7 +76,7 @@ datachain/func/string.py,sha256=kXkPHimtA__EVg_Th1yldGaLJpw4HYVhIeYtKy3DuyQ,7406
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=eCZtqbjAzkL4aemY74f_XkIJ_FWwXugJNjIFOwDa9w0,10815
-datachain/lib/audio.py,sha256=3QWQ7PHuRnen7al8EjgjWuKbRKe4SvrbWELJ1T_Cin0,7545
+datachain/lib/audio.py,sha256=hHG29vqrV389im152wCjh80d0xqXGGvFnUpUwkzZejQ,7385
 datachain/lib/clip.py,sha256=nF8-N6Uz0MbAsPJBY2iXEYa3DPLo80OOer5SRNAtcGM,6149
 datachain/lib/data_model.py,sha256=H-bagx24-cLlC7ngSP6Dby4mB6kSxxV7KDiHxQjzwlg,3798
 datachain/lib/dataset_info.py,sha256=Ym7yYcGpfUmPLrfdxueijCVRP2Go6KbyuLk_fmzYgDU,3273

@@ -109,7 +109,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=Sxj0ojeMSpAwM_NNoXa1dMR_2L_cQ6X
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=fIfj5-2Ix4z5D5yZueagd5WUWw86pusJ9JJKD-U3KGg,4407
 datachain/lib/dc/database.py,sha256=Wqob3dQc9Mol_0vagzVEXzteCKS9M0E3U5130KVmQKg,14629
-datachain/lib/dc/datachain.py,sha256=Q8iEmf0MT6o5ORjyoKAt2xEIelcJ6vzZoB2e7haT7V8,104189
+datachain/lib/dc/datachain.py,sha256=cVqgemBiPVLSnfEVDLU1YH0dtowS-N-YFOAxV1k7i6U,104178
 datachain/lib/dc/datasets.py,sha256=A4SW-b3dkQnm9Wi7ciCdlXqtrsquIeRfBQN_bJ_ulqY,15237
 datachain/lib/dc/hf.py,sha256=FeruEO176L2qQ1Mnx0QmK4kV0GuQ4xtj717N8fGJrBI,2849
 datachain/lib/dc/json.py,sha256=iJ6G0jwTKz8xtfh1eICShnWk_bAMWjF5bFnOXLHaTlw,2683

@@ -132,11 +132,11 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ugTlSFqh_kxMcG6vJ5XrEzG9jBXRdb7KRAEEsFWiPew,4190
-datachain/query/dataset.py,sha256=lv5Ta7FjFZWQRUTz9_97oeoT5OvD62unRoNLgEueWUU,67384
-datachain/query/dispatch.py,sha256=B0sxnyN6unU8VFc35eWa_pe_TX6JfHDDbzyIQtp8AoM,15665
+datachain/query/dataset.py,sha256=Pu8FC11VcIj8ewXJxe0mjJpr4HBr2-gvCtMk4GQCva0,67419
+datachain/query/dispatch.py,sha256=Tg73zB6vDnYYYAvtlS9l7BI3sI1EfRCbDjiasvNxz2s,16385
 datachain/query/metrics.py,sha256=qOMHiYPTMtVs2zI-mUSy8OPAVwrg4oJtVF85B9tdQyM,810
 datachain/query/params.py,sha256=JkVz6IKUIpF58JZRkUXFT8DAHX2yfaULbhVaGmHKFLc,826
-datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
+datachain/query/queue.py,sha256=kCetMG6y7_ynV_jJDAXkLsf8WsVZCEk1fAuQGd7yTOo,3543
 datachain/query/schema.py,sha256=Cn1keXjktptAbEDbHlxSzdoCu5H6h_Vzp_DtNpMSr5w,6697
 datachain/query/session.py,sha256=lbwMDvxjZ2BS2rA9qk7MVBRzlsSrwH92yJ_waP3uvDc,6781
 datachain/query/udf.py,sha256=SLLLNLz3QmtaM04ZVTu7K6jo58I-1j5Jf7Lb4ORv4tQ,1385

@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.36.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.36.0.dist-info/METADATA,sha256=ZH1x0Zcl8YD035rT1qvKm3D_NnSRgGtnD0TP2FNlwgI,13606
-datachain-0.36.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.36.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.36.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.36.0.dist-info/RECORD,,
+datachain-0.36.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.36.1.dist-info/METADATA,sha256=BBaBx1Ail7RzpUlvEywlXKZtl_6Vn-KIEjm8OJdXrng,13657
+datachain-0.36.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.36.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.36.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.36.1.dist-info/RECORD,,