datachain 0.7.11__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import (
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join

 from .datasource import DataSource

@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11

 # dataset pull
-PULL_DATASET_MAX_THREADS = 10
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar

     def done_task(self, done):
         for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")

+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
@@ -207,31 +217,22 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds = metastore.get_dataset(self.local_ds_name)

         urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
-
-                r.raise_for_status()
-
-                df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))

-                self.fix_columns(df)
+        for url in urls:
+            if self.should_check_for_status():
+                self.check_for_status()

-                # id will be autogenerated in DB
-                df = df.drop("sys__id", axis=1)
+            df = pd.read_parquet(
+                io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+            )
+            df = self.fix_columns(df)

-                inserted = warehouse.insert_dataset_rows(
-                    df, local_ds, self.local_ds_version
-                )
-                self.increase_counter(inserted)  # type: ignore [arg-type]
-                urls.remove(url)
+            inserted = warehouse.insert_dataset_rows(
+                df, local_ds, self.local_ds_version
+            )
+            self.increase_counter(inserted)  # type: ignore [arg-type]
+            # sometimes progress bar doesn't get updated so manually updating it
+            self.update_progress_bar(self.progress_bar)


 @dataclass
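The new `get_parquet_content` helper factors the 404-polling out of `do_task`: Studio returns 404 until a chunk has been exported, so the client sleeps and retries until the object exists, then decompresses and parses it. A minimal standalone sketch of the same pattern (the function name and parameters here are illustrative, not part of the datachain API):

```py
import io
import time

import lz4.frame
import pandas as pd
import requests


def fetch_parquet_chunk(url: str, timeout: float = 3600, poll_interval: float = 0.1) -> pd.DataFrame:
    """Poll a signed URL until the exported chunk exists, then load it."""
    while True:
        r = requests.get(url, timeout=timeout)
        if r.status_code == 404:
            # chunk not exported yet; wait and try again
            time.sleep(poll_interval)
            continue
        r.raise_for_status()
        # chunks are lz4-compressed parquet, as in DatasetRowsFetcher above
        return pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
```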
@@ -1291,13 +1292,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
         self,
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
         local_ds_version: Optional[int] = None,
-        no_cp: bool = False,
+        cp: bool = False,
         force: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if no_cp:
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")

-        if not output and not no_cp:
+        if cp and not output:
             raise ValueError("Please provide output directory for instantiation")

         studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
            signed_urls = export_response.data

            if signed_urls:
-                shuffle(signed_urls)
-
                with (
                    self.metastore.clone() as metastore,
                    self.warehouse.clone() as warehouse,
                ):
+
+                    def batch(urls):
+                        """
+                        Batching urls in a way that fetching is most efficient as
+                        urls with lower id will be created first. Because that, we
+                        are making sure all threads are pulling most recent urls
+                        from beginning
+                        """
+                        res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                        current_worker = 0
+                        for url in signed_urls:
+                            res[current_worker].append(url)
+                            current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                        return res
+
                    rows_fetcher = DatasetRowsFetcher(
                        metastore,
                        warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
                        local_ds_name,
                        local_ds_version,
                        schema,
+                        progress_bar=dataset_save_progress_bar,
                    )
                    try:
                        rows_fetcher.run(
-                            batched(
-                                signed_urls,
-                                math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                            ),
-                            dataset_save_progress_bar,
+                            iter(batch(signed_urls)), dataset_save_progress_bar
                        )
                    except:
                        self.remove_dataset(local_ds_name, local_ds_version)
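The `batch()` helper above replaces the old shuffle-plus-`batched()` scheme with round-robin assignment, so every worker starts on the lowest-numbered (earliest-exported) chunks. A minimal sketch of the same idea, with a hypothetical `num_workers` parameter standing in for the module constant:

```py
def round_robin_batches(urls: list[str], num_workers: int) -> list[list[str]]:
    # distribute urls across workers in round-robin order, preserving
    # the original ordering within each worker's batch
    batches: list[list[str]] = [[] for _ in range(num_workers)]
    for i, url in enumerate(urls):
        batches[i % num_workers].append(url)
    return batches


# round_robin_batches(["u1", "u2", "u3", "u4", "u5"], 2)
# -> [["u1", "u3", "u5"], ["u2", "u4"]]
```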
datachain/cli.py CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )

+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+

 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--no-cp",
+        "--cp",
         default=False,
         action="store_true",
-        help="Do not copy files, just pull a remote dataset into local DB",
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             args.output,
             local_ds_name=args.local_name,
             local_ds_version=args.local_version,
-            no_cp=args.no_cp,
+            cp=args.cp,
             force=bool(args.force),
             edatachain=args.edatachain,
             edatachain_file=args.edatachain_file,
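The semantics are inverted here: the old `--no-cp` flag is gone and file copying is now opt-in via `--cp`. Assuming the subcommand is registered as `datachain pull`, a bare pull only registers the remote dataset in the local DB, while adding `--cp` (together with an output location, which `pull_dataset` now requires in that case) also materializes the files locally.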
datachain/data_storage/sqlite.py CHANGED
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):

     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)

     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)

     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(table_name, self.db, if_exists="append", index=False)
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )

     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-        self.db.executemany(
-            table.insert().values({f: bindparam(f) for f in rows[0]}),
-            rows,
-        )
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )

     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
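The comment in the hunk above captures the rationale: without an explicit transaction, SQLite effectively commits per statement, so bulk inserts pay the commit cost for every row. A minimal illustration of the same effect using only the standard-library `sqlite3` module (independent of datachain's own `DatabaseEngine` wrapper):

```py
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE rows (id INTEGER, name TEXT)")

data = [(i, f"name-{i}") for i in range(10_000)]

# one explicit transaction around the whole batch: a single commit
# instead of one implicit commit per INSERT
with conn:  # commits on success, rolls back on error
    conn.executemany("INSERT INTO rows VALUES (?, ?)", data)
```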
datachain/lib/dc.py CHANGED
@@ -41,7 +41,7 @@ from datachain.lib.listing import (
     parse_listing_uri,
 )
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta, read_schema
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -554,8 +554,7 @@ class DataChain:
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +563,12 @@ class DataChain:

         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-            print_schema : print auto-generated schema
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
@@ -594,75 +593,14 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name = meta_type
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
-                meta_type=meta_type,
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
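With `meta_type` and `print_schema` gone (and the separate `from_jsonl` classmethod removed), the input format is now selected through the single `format` parameter. A minimal usage sketch based on the signature above (the bucket URI is made up for the example):

```py
from datachain.lib.dc import DataChain

# infer the schema from a JSON-lines sample and parse at most one row per file
chain = DataChain.from_json("gs://mybucket/meta/", format="jsonl", nrows=1)
```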
@@ -793,47 +731,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )

-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1521,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
datachain/lib/diff.py ADDED
@@ -0,0 +1,197 @@
+import random
+import string
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+import sqlalchemy as sa
+
+from datachain.lib.signal_schema import SignalSchema
+from datachain.query.schema import Column
+from datachain.sql.types import String
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(  # noqa: PLR0912, PLR0915, C901
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = True,
+    status_col: Optional[str] = None,
+) -> "DataChain":
+    """Comparing two chains by identifying rows that are added, deleted, modified
+    or same"""
+    dialect = left._query.dialect
+
+    rname = "right_"
+
+    def _rprefix(c: str, rc: str) -> str:
+        """Returns prefix of right of two companion left - right columns
+        from merge. If companion columns have the same name then prefix will
+        be present in right column name, otherwise it won't.
+        """
+        return rname if c == rc else ""
+
+    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+        return [obj] if isinstance(obj, str) else list(obj)
+
+    if on is None:
+        raise ValueError("'on' must be specified")
+
+    on = _to_list(on)
+    if right_on:
+        right_on = _to_list(right_on)
+        if len(on) != len(right_on):
+            raise ValueError("'on' and 'right_on' must be have the same length")
+
+    if compare:
+        compare = _to_list(compare)
+
+    if right_compare:
+        if not compare:
+            raise ValueError("'compare' must be defined if 'right_compare' is defined")
+
+        right_compare = _to_list(right_compare)
+        if len(compare) != len(right_compare):
+            raise ValueError(
+                "'compare' and 'right_compare' must be have the same length"
+            )
+
+    if not any([added, deleted, modified, same]):
+        raise ValueError(
+            "At least one of added, deleted, modified, same flags must be set"
+        )
+
+    # we still need status column for internal implementation even if not
+    # needed in output
+    need_status_col = bool(status_col)
+    status_col = status_col or "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+    # calculate on and compare column names
+    right_on = right_on or on
+    cols = left.signals_schema.clone_without_sys_signals().db_signals()
+    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+
+    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    if compare:
+        right_compare = right_compare or compare
+        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+    elif not compare and len(cols) != len(right_cols):
+        # here we will mark all rows that are not added or deleted as modified since
+        # there was no explicit list of compare columns provided (meaning we need
+        # to check all columns to determine if row is modified or same), but
+        # the number of columns on left and right is not the same (one of the chains
+        # have additional column)
+        compare = None
+        right_compare = None
+    else:
+        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
+        right_compare = compare
+
+    diff_cond = []
+
+    if added:
+        added_cond = sa.and_(
+            *[
+                C(c) == None  # noqa: E711
+                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
+            ]
+        )
+        diff_cond.append((added_cond, "A"))
+    if modified and compare:
+        modified_cond = sa.or_(
+            *[
+                C(c) != C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((modified_cond, "M"))
+    if same and compare:
+        same_cond = sa.and_(
+            *[
+                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((same_cond, "S"))
+
+    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff.type = String()
+
+    left_right_merge = left.merge(
+        right, on=on, right_on=right_on, inner=False, rname=rname
+    )
+    left_right_merge_select = left_right_merge._query.select(
+        *(
+            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
+            + [C(c) for c in on]
+            + [C(c) for c in cols if c not in on]
+            + [diff]
+        )
+    )
+
+    diff_col = sa.literal("D").label(status_col)
+    diff_col.type = String()
+
+    right_left_merge = right.merge(
+        left, on=right_on, right_on=on, inner=False, rname=rname
+    ).filter(
+        sa.and_(
+            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+        )
+    )
+
+    def _default_val(chain: "DataChain", col: str):
+        col_type = chain._query.column_types[col]  # type: ignore[index]
+        val = sa.literal(col_type.default_value(dialect)).label(col)
+        val.type = col_type()
+        return val
+
+    right_left_merge_select = right_left_merge._query.select(
+        *(
+            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
+            + [
+                C(c) if c == rc else _default_val(left, c)
+                for c, rc in zip(on, right_on)
+            ]
+            + [
+                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
+                for c in cols
+                if c not in on
+            ]
+            + [diff_col]
+        )
+    )
+
+    if not deleted:
+        res = left_right_merge_select
+    elif deleted and not any([added, modified, same]):
+        res = right_left_merge_select
+    else:
+        res = left_right_merge_select.union(right_left_merge_select)
+
+    res = res.filter(C(status_col) != None)  # noqa: E711
+
+    schema = left.signals_schema
+    if need_status_col:
+        res = res.select()
+        schema = SignalSchema({status_col: str}) | schema
+    else:
+        res = res.select_except(C(status_col))
+
+    return left._evolve(query=res, signal_schema=schema)
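A minimal end-to-end sketch of the new comparison API, built on `DataChain.from_values` (which already exists in `datachain.lib.dc`); the column names and values here are made up for the example:

```py
from datachain.lib.dc import DataChain

old = DataChain.from_values(id=[1, 2, 3], name=["a", "b", "c"])
new = DataChain.from_values(id=[2, 3, 4], name=["b", "C", "d"])

# rows only in `new` are marked "A", rows only in `old` are "D",
# matching ids with a changed name are "M", identical rows are "S"
result = new.compare(old, on=["id"], compare=["name"], same=True, status_col="diff")
```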
datachain/lib/meta_formats.py CHANGED
@@ -38,38 +38,41 @@ def process_json(data_string, jmespath):
     return json_dict


-# Print a dynamic datamodel-codegen output from JSON or CSV on stdout
-def read_schema(source_file, data_type="csv", expr=None, model_name=None):
+def gen_datamodel_code(
+    source_file, format="json", jmespath=None, model_name=None
+) -> str:
+    """Generates Python code with Pydantic models that corresponds
+    to the provided JSON, CSV, or JSONL file.
+    It support root JSON arrays (samples the first entry).
+    """
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
         # comply with Python class names
         uid_str = str(generate_uuid()).replace("-", "")
-        model_name = f"Model{data_type}{uid_str}"
-    try:
-        with source_file.open() as fd:  # CSV can be larger than memory
-            if data_type == "csv":
-                data_string += fd.readline().replace("\r", "")
-                data_string += fd.readline().replace("\r", "")
-            elif data_type == "jsonl":
-                data_string = fd.readline().replace("\r", "")
-            else:
-                data_string = fd.read()  # other meta must fit into RAM
-    except OSError as e:
-        print(f"An unexpected file error occurred: {e}")
-        return
-    if data_type in ("json", "jsonl"):
-        json_object = process_json(data_string, expr)
-        if data_type == "json" and isinstance(json_object, list):
+        model_name = f"Model{format}{uid_str}"
+
+    with source_file.open() as fd:  # CSV can be larger than memory
+        if format == "csv":
+            data_string += fd.readline().replace("\r", "")
+            data_string += fd.readline().replace("\r", "")
+        elif format == "jsonl":
+            data_string = fd.readline().replace("\r", "")
+        else:
+            data_string = fd.read()  # other meta must fit into RAM
+
+    if format in ("json", "jsonl"):
+        json_object = process_json(data_string, jmespath)
+        if format == "json" and isinstance(json_object, list):
             json_object = json_object[0]  # sample the 1st object from JSON array
-        if data_type == "jsonl":
-            data_type = "json"  # treat json line as plain JSON in auto-schema
+        if format == "jsonl":
+            format = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)

     import datamodel_code_generator

     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
-    input_file_type = input_file_types[data_type]
+    input_file_type = input_file_types[format]
     with tempfile.TemporaryDirectory() as tmpdir:
         output = Path(tmpdir) / "model.py"
         datamodel_code_generator.generate(
@@ -95,36 +98,29 @@ spec = {model_name}
 def read_meta(  # noqa: C901
     spec=None,
     schema_from=None,
-    meta_type="json",
+    format="json",
     jmespath=None,
-    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain

     if schema_from:
-        chain = (
-            DataChain.from_storage(schema_from, type="text")
-            .limit(1)
-            .map(  # dummy column created (#1615)
-                meta_schema=lambda file: read_schema(
-                    file, data_type=meta_type, expr=jmespath, model_name=model_name
-                ),
-                output=str,
-            )
+        file = next(
+            DataChain.from_storage(schema_from, type="text").limit(1).collect("file")
         )
-        (model_output,) = chain.collect("meta_schema")
-        assert isinstance(model_output, str)
-        if print_schema:
-            print(f"{model_output}")
+        model_code = gen_datamodel_code(
+            file, format=format, jmespath=jmespath, model_name=model_name
+        )
+        assert isinstance(model_code, str)
+
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         gl = globals()
-        exec(model_output, gl)  # type: ignore[arg-type] # noqa: S102
+        exec(model_code, gl)  # type: ignore[arg-type] # noqa: S102
         spec = gl["spec"]

-    if not (spec) and not (schema_from):
+    if not spec and not schema_from:
         raise ValueError(
             "Must provide a static schema in spec: or metadata sample in schema_from:"
         )
@@ -136,7 +132,7 @@ def read_meta(  # noqa: C901
     def parse_data(
         file: File,
         data_model=spec,
-        meta_type=meta_type,
+        format=format,
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:
@@ -148,7 +144,7 @@ def read_meta(  # noqa: C901
            except ValidationError as e:
                print(f"Validation error occurred in row {nrow} file {file.name}:", e)

-        if meta_type == "csv":
+        if format == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers
@@ -156,7 +152,7 @@ def read_meta(  # noqa: C901
                for row in reader:  # CSV can be larger than memory
                    yield from validator(row)

-        if meta_type == "json":
+        if format == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()
@@ -174,7 +170,7 @@ def read_meta(  # noqa: C901
                    return
                yield from validator(json_dict, nrow)

-        if meta_type == "jsonl":
+        if format == "jsonl":
            try:
                nrow = 0
                with file.open() as fd:
datachain/query/dataset.py CHANGED
@@ -1069,6 +1069,7 @@ class DatasetQuery:
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
         self.starting_step = QueryStep(self.catalog, name, self.version)
+        self.dialect = self.catalog.warehouse.db.dialect

     def __iter__(self):
         return iter(self.db_results())
datachain/remote/studio.py CHANGED
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 import os
-from collections.abc import Iterable, Iterator
+from collections.abc import AsyncIterator, Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
 from typing import (
@@ -11,6 +11,9 @@ from typing import (
     Optional,
     TypeVar,
 )
+from urllib.parse import urlparse, urlunparse
+
+import websockets

 from datachain.config import Config
 from datachain.dataset import DatasetStats
@@ -22,6 +25,7 @@ LsData = Optional[list[dict[str, Any]]]
 DatasetInfoData = Optional[dict[str, Any]]
 DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
+DatasetJobVersionsData = Optional[dict[str, Any]]
 DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
 FileUploadData = Optional[dict[str, Any]]
@@ -231,6 +235,40 @@ class StudioClient:

         return msgpack.ExtType(code, data)

+    async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
+        """
+        Follow job logs via websocket connection.
+
+        Args:
+            job_id: ID of the job to follow logs for
+
+        Yields:
+            Dict containing either job status updates or log messages
+        """
+        parsed_url = urlparse(self.url)
+        ws_url = urlunparse(
+            parsed_url._replace(scheme="wss" if parsed_url.scheme == "https" else "ws")
+        )
+        ws_url = f"{ws_url}/logs/follow/?job_id={job_id}&team_name={self.team}"
+
+        async with websockets.connect(
+            ws_url,
+            additional_headers={"Authorization": f"token {self.token}"},
+        ) as websocket:
+            while True:
+                try:
+                    message = await websocket.recv()
+                    data = json.loads(message)
+
+                    # Yield the parsed message data
+                    yield data
+
+                except websockets.exceptions.ConnectionClosed:
+                    break
+                except Exception as e:  # noqa: BLE001
+                    logger.error("Error receiving websocket message: %s", e)
+                    break
+
     def ls(self, paths: Iterable[str]) -> Iterator[tuple[str, Response[LsData]]]:
         # TODO: change LsData (response.data value) to be list of lists
         # to handle cases where a path will be expanded (i.e. globs)
@@ -302,6 +340,13 @@ class StudioClient:
             method="GET",
         )

+    def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
+        return self._send_request(
+            "datachain/datasets/dataset_job_versions",
+            {"job_id": job_id},
+            method="GET",
+        )
+
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
             "datachain/datasets/stats",
@@ -359,3 +404,10 @@ class StudioClient:
             "requirements": requirements,
         }
         return self._send_request("datachain/job", data)
+
+    def cancel_job(
+        self,
+        job_id: str,
+    ) -> Response[JobData]:
+        url = f"datachain/job/{job_id}/cancel"
+        return self._send_request(url, data={}, method="POST")
datachain/studio.py CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import os
 from typing import TYPE_CHECKING, Optional

@@ -19,7 +20,7 @@ POST_LOGIN_MESSAGE = (
 )


-def process_studio_cli_args(args: "Namespace"):
+def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
     if args.cmd == "login":
         return login(args)
     if args.cmd == "logout":
@@ -47,6 +48,9 @@ def process_studio_cli_args(args: "Namespace"):
             args.req_file,
         )

+    if args.cmd == "cancel":
+        return cancel_job(args.job_id, args.team)
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -227,8 +231,34 @@ def create_job(
     if not response.data:
         raise DataChainError("Failed to create job")

-    print(f"Job {response.data.get('job', {}).get('id')} created")
+    job_id = response.data.get("job", {}).get("id")
+    print(f"Job {job_id} created")
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+    print("=" * 40)
+
+    # Sync usage
+    async def _run():
+        async for message in client.tail_job_logs(job_id):
+            if "logs" in message:
+                for log in message["logs"]:
+                    print(log["message"], end="")
+            elif "job" in message:
+                print(f"\n>>>> Job is now in {message['job']['status']} status.")
+
+    asyncio.run(_run())
+
+    response = client.dataset_job_versions(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    response_data = response.data
+    if response_data:
+        dataset_versions = response_data.get("dataset_versions", [])
+        print("\n\n>>>> Dataset versions created during the job:")
+        for version in dataset_versions:
+            print(f" - {version.get('dataset_name')}@v{version.get('version')}")
+    else:
+        print("No dataset versions created during the job.")


 def upload_files(client: StudioClient, files: list[str]) -> list[str]:
@@ -248,3 +278,18 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
         if file_id:
             file_ids.append(str(file_id))
     return file_ids
+
+
+def cancel_job(job_id: str, team_name: Optional[str]):
+    token = Config().read().get("studio", {}).get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    client = StudioClient(team=team_name)
+    response = client.cancel_job(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    print(f"Job {job_id} canceled")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.11
+Version: 0.8.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client<1,>=0.21
 Requires-Dist: tabulate
+Requires-Dist: websockets
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -98,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.48; extra == "examples"
+Requires-Dist: ultralytics==8.3.50; extra == "examples"

 ================
 |logo| DataChain
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=wQiYQ_qSVCGvS06pkknT9_FIBdFRzBdeRusW9uXE3vQ,42505
+datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
@@ -14,11 +14,11 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=Hr0Ha0kou0so4i8i-gWiXC1AYlJ2arI1D55cc7mi3tg,7253
+datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
+datachain/catalog/catalog.py,sha256=nuWjSIs4MO1hJa8-LQGbiMXLWWznPB_VKSVpS7368t4,58415
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -35,7 +35,7 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=hfTITcesE9XlUTxcCcdDyWGGep-QSjJL9DUxko5QCeI,37524
 datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
+datachain/data_storage/sqlite.py,sha256=iJv1QxwVifOowtYhIDqYVoea21dvkQIdxklGNIend3c,22961
 datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
 datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
@@ -53,13 +53,14 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=qMhpVPdWeuXBDhmKKoq3fkq12Cx_ZPxDdsl_juu482o,89595
+datachain/lib/dc.py,sha256=7Wm6TEPVNCSh4bz0iA9JvEsYtYAZ9o97lK7TEJ8modE,92149
+datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
 datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=6_gB23fWlvd-edOO3UvDHvj6dBXVL61T7x8RX51FW84,6685
+datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
@@ -88,7 +89,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=JrImhguXj2ZDwJpfuyhcgxSIlqSPy5NmLDLc3muFQJs,54610
+datachain/query/dataset.py,sha256=fECGctERQrfLIowN9Fo6dTSnmHEe9WbfcjHRtRObcio,54667
 datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,7 +97,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
 datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=WiK6fpRAw0a6Dth4XXI0YInEHH4gDU7AUHHDNd3wJzg,11616
+datachain/remote/studio.py,sha256=3DlgESETzxm3dgb6zzjjGxsddSkacT2dARnteLAfMxQ,13366
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -118,9 +119,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.7.11.dist-info/METADATA,sha256=ADTTf0_eJImM-tIPR-jQydM3N9Iis-ECRxWgkwLM8lU,8412
-datachain-0.7.11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.7.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.7.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.7.11.dist-info/RECORD,,
+datachain-0.8.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.8.0.dist-info/METADATA,sha256=PXb2pYY67bdfDjFXR7C9hwN6LaKSmseRZJNFakrWfyg,8437
+datachain-0.8.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.8.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.8.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.8.0.dist-info/RECORD,,