datachain 0.7.10__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Potentially problematic release: this version of datachain may be problematic.
- datachain/catalog/catalog.py +53 -41
- datachain/cli.py +25 -3
- datachain/client/__init__.py +1 -2
- datachain/data_storage/sqlite.py +20 -6
- datachain/lib/dc.py +160 -110
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +2 -1
- datachain/lib/meta_formats.py +40 -43
- datachain/lib/pytorch.py +1 -5
- datachain/lib/signal_schema.py +28 -6
- datachain/query/dataset.py +5 -1
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/toolkit/split.py +19 -6
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/METADATA +10 -10
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/RECORD +20 -19
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/LICENSE +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/WHEEL +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/top_level.txt +0 -0
datachain/lib/diff.py
ADDED
@@ -0,0 +1,197 @@
import random
import string
from collections.abc import Sequence
from typing import TYPE_CHECKING, Optional, Union

import sqlalchemy as sa

from datachain.lib.signal_schema import SignalSchema
from datachain.query.schema import Column
from datachain.sql.types import String

if TYPE_CHECKING:
    from datachain.lib.dc import DataChain


C = Column


def compare(  # noqa: PLR0912, PLR0915, C901
    left: "DataChain",
    right: "DataChain",
    on: Union[str, Sequence[str]],
    right_on: Optional[Union[str, Sequence[str]]] = None,
    compare: Optional[Union[str, Sequence[str]]] = None,
    right_compare: Optional[Union[str, Sequence[str]]] = None,
    added: bool = True,
    deleted: bool = True,
    modified: bool = True,
    same: bool = True,
    status_col: Optional[str] = None,
) -> "DataChain":
    """Comparing two chains by identifying rows that are added, deleted, modified
    or same"""
    dialect = left._query.dialect

    rname = "right_"

    def _rprefix(c: str, rc: str) -> str:
        """Returns prefix of right of two companion left - right columns
        from merge. If companion columns have the same name then prefix will
        be present in right column name, otherwise it won't.
        """
        return rname if c == rc else ""

    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
        return [obj] if isinstance(obj, str) else list(obj)

    if on is None:
        raise ValueError("'on' must be specified")

    on = _to_list(on)
    if right_on:
        right_on = _to_list(right_on)
        if len(on) != len(right_on):
            raise ValueError("'on' and 'right_on' must be have the same length")

    if compare:
        compare = _to_list(compare)

    if right_compare:
        if not compare:
            raise ValueError("'compare' must be defined if 'right_compare' is defined")

        right_compare = _to_list(right_compare)
        if len(compare) != len(right_compare):
            raise ValueError(
                "'compare' and 'right_compare' must be have the same length"
            )

    if not any([added, deleted, modified, same]):
        raise ValueError(
            "At least one of added, deleted, modified, same flags must be set"
        )

    # we still need status column for internal implementation even if not
    # needed in output
    need_status_col = bool(status_col)
    status_col = status_col or "diff_" + "".join(
        random.choice(string.ascii_letters)  # noqa: S311
        for _ in range(10)
    )

    # calculate on and compare column names
    right_on = right_on or on
    cols = left.signals_schema.clone_without_sys_signals().db_signals()
    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()

    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
    if compare:
        right_compare = right_compare or compare
        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
    elif not compare and len(cols) != len(right_cols):
        # here we will mark all rows that are not added or deleted as modified since
        # there was no explicit list of compare columns provided (meaning we need
        # to check all columns to determine if row is modified or same), but
        # the number of columns on left and right is not the same (one of the chains
        # have additional column)
        compare = None
        right_compare = None
    else:
        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
        right_compare = compare

    diff_cond = []

    if added:
        added_cond = sa.and_(
            *[
                C(c) == None  # noqa: E711
                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
            ]
        )
        diff_cond.append((added_cond, "A"))
    if modified and compare:
        modified_cond = sa.or_(
            *[
                C(c) != C(f"{_rprefix(c, rc)}{rc}")
                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
            ]
        )
        diff_cond.append((modified_cond, "M"))
    if same and compare:
        same_cond = sa.and_(
            *[
                C(c) == C(f"{_rprefix(c, rc)}{rc}")
                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
            ]
        )
        diff_cond.append((same_cond, "S"))

    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
    diff.type = String()

    left_right_merge = left.merge(
        right, on=on, right_on=right_on, inner=False, rname=rname
    )
    left_right_merge_select = left_right_merge._query.select(
        *(
            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
            + [C(c) for c in on]
            + [C(c) for c in cols if c not in on]
            + [diff]
        )
    )

    diff_col = sa.literal("D").label(status_col)
    diff_col.type = String()

    right_left_merge = right.merge(
        left, on=right_on, right_on=on, inner=False, rname=rname
    ).filter(
        sa.and_(
            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
        )
    )

    def _default_val(chain: "DataChain", col: str):
        col_type = chain._query.column_types[col]  # type: ignore[index]
        val = sa.literal(col_type.default_value(dialect)).label(col)
        val.type = col_type()
        return val

    right_left_merge_select = right_left_merge._query.select(
        *(
            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
            + [
                C(c) if c == rc else _default_val(left, c)
                for c, rc in zip(on, right_on)
            ]
            + [
                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
                for c in cols
                if c not in on
            ]
            + [diff_col]
        )
    )

    if not deleted:
        res = left_right_merge_select
    elif deleted and not any([added, modified, same]):
        res = right_left_merge_select
    else:
        res = left_right_merge_select.union(right_left_merge_select)

    res = res.filter(C(status_col) != None)  # noqa: E711

    schema = left.signals_schema
    if need_status_col:
        res = res.select()
        schema = SignalSchema({status_col: str}) | schema
    else:
        res = res.select_except(C(status_col))

    return left._evolve(query=res, signal_schema=schema)
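The new compare() helper underpins chain diffing in 0.8.0. A minimal usage sketch, assuming two small in-memory chains keyed by an "id" column (the column names and values are illustrative, not taken from the diff):

from datachain.lib.dc import DataChain
from datachain.lib.diff import compare

left = DataChain.from_values(id=[1, 2, 3], value=["a", "b", "c"])
right = DataChain.from_values(id=[2, 3, 4], value=["b", "x", "z"])

# Rows matched on "id" are labeled "M" (modified) or "S" (same) based on the
# remaining columns; rows present on only one side become "A" or "D".
result = compare(left, right, on="id", status_col="diff")
result.show()

Passing status_col keeps the label column in the output; omitting it returns only the rows in the requested categories, without the label.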
datachain/lib/file.py
CHANGED
@@ -17,7 +17,6 @@ from urllib.request import url2pathname
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
-from pyarrow.dataset import dataset
 from pydantic import Field, field_validator
 
 from datachain.client.fileslice import FileSlice

@@ -452,6 +451,8 @@ class ArrowRow(DataModel):
     @contextmanager
     def open(self):
         """Stream row contents from indexed file."""
+        from pyarrow.dataset import dataset
+
         if self.file._caching_enabled:
             self.file.ensure_cached()
             path = self.file.get_local_path()
datachain/lib/meta_formats.py
CHANGED
@@ -6,7 +6,6 @@ from collections.abc import Iterator
 from pathlib import Path
 from typing import Callable
 
-import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 

@@ -39,36 +38,41 @@ def process_json(data_string, jmespath):
     return json_dict
 
 
-[2 removed lines not shown in the diff view]
+def gen_datamodel_code(
+    source_file, format="json", jmespath=None, model_name=None
+) -> str:
+    """Generates Python code with Pydantic models that corresponds
+    to the provided JSON, CSV, or JSONL file.
+    It support root JSON arrays (samples the first entry).
+    """
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
         # comply with Python class names
         uid_str = str(generate_uuid()).replace("-", "")
-        model_name = f"Model{…
-[13 removed lines not shown in the diff view]
-        json_object = process_json(data_string, expr)
-        if data_type == "json" and isinstance(json_object, list):
+        model_name = f"Model{format}{uid_str}"
+
+    with source_file.open() as fd:  # CSV can be larger than memory
+        if format == "csv":
+            data_string += fd.readline().replace("\r", "")
+            data_string += fd.readline().replace("\r", "")
+        elif format == "jsonl":
+            data_string = fd.readline().replace("\r", "")
+        else:
+            data_string = fd.read()  # other meta must fit into RAM
+
+    if format in ("json", "jsonl"):
+        json_object = process_json(data_string, jmespath)
+        if format == "json" and isinstance(json_object, list):
             json_object = json_object[0]  # sample the 1st object from JSON array
-        if …
-[removed line not shown in the diff view]
+        if format == "jsonl":
+            format = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
 
+    import datamodel_code_generator
+
    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
-    input_file_type = input_file_types[…
+    input_file_type = input_file_types[format]
     with tempfile.TemporaryDirectory() as tmpdir:
         output = Path(tmpdir) / "model.py"
         datamodel_code_generator.generate(

@@ -94,36 +98,29 @@ spec = {model_name}
 def read_meta(  # noqa: C901
     spec=None,
     schema_from=None,
-[removed line not shown in the diff view]
+    format="json",
     jmespath=None,
-    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
     if schema_from:
-[removed line not shown in the diff view]
-        DataChain.from_storage(schema_from, type="text")
-        .limit(1)
-        .map(  # dummy column created (#1615)
-            meta_schema=lambda file: read_schema(
-                file, data_type=meta_type, expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
+        file = next(
+            DataChain.from_storage(schema_from, type="text").limit(1).collect("file")
         )
-[4 removed lines not shown in the diff view]
+        model_code = gen_datamodel_code(
+            file, format=format, jmespath=jmespath, model_name=model_name
+        )
+        assert isinstance(model_code, str)
+
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         gl = globals()
-        exec(…
+        exec(model_code, gl)  # type: ignore[arg-type]  # noqa: S102
         spec = gl["spec"]
 
-    if not …
+    if not spec and not schema_from:
         raise ValueError(
             "Must provide a static schema in spec: or metadata sample in schema_from:"
         )

@@ -135,7 +132,7 @@ def read_meta(  # noqa: C901
     def parse_data(
         file: File,
         data_model=spec,
-[removed line not shown in the diff view]
+        format=format,
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:

@@ -147,7 +144,7 @@ def read_meta(  # noqa: C901
         except ValidationError as e:
             print(f"Validation error occurred in row {nrow} file {file.name}:", e)
 
-        if …
+        if format == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers

@@ -155,7 +152,7 @@ def read_meta(  # noqa: C901
            for row in reader:  # CSV can be larger than memory
                yield from validator(row)
 
-        if …
+        if format == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()

@@ -173,7 +170,7 @@ def read_meta(  # noqa: C901
                return
            yield from validator(json_dict, nrow)
 
-        if …
+        if format == "jsonl":
            try:
                nrow = 0
                with file.open() as fd:
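The reworked helpers can also be driven directly; a sketch of generating model code for a sample file, mirroring the read_meta() path above (the storage URI is a placeholder):

from datachain.lib.dc import DataChain
from datachain.lib.meta_formats import gen_datamodel_code

# Sample one file from storage and emit Python source defining the Pydantic model(s).
file = next(
    DataChain.from_storage("gs://bucket/meta.jsonl", type="text").limit(1).collect("file")
)
model_code = gen_datamodel_code(file, format="jsonl", jmespath=None)
print(model_code)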
datachain/lib/pytorch.py
CHANGED
@@ -7,7 +7,6 @@ from torch import float32
 from torch.distributed import get_rank, get_world_size
 from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
-from tqdm import tqdm
 
 from datachain import Session
 from datachain.asyn import AsyncMapper

@@ -112,10 +111,7 @@ class PytorchDataset(IterableDataset):
            from datachain.lib.udf import _prefetch_input
 
            rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
-
-        desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
-        with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
-            yield from map(self._process_row, rows_it)
+        yield from map(self._process_row, rows)
 
     def _process_row(self, row_features):
         row = []
datachain/lib/signal_schema.py
CHANGED
@@ -402,9 +402,20 @@ class SignalSchema:
            if ModelStore.is_pydantic(finfo.annotation):
                SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
 
-    def get_column_type(self, col_name: str) -> DataType:
+    def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType:
+        """
+        Returns column type by column name.
+
+        If `with_subtree` is True, then it will return the type of the column
+        even if it has a subtree (e.g. model with nested fields), otherwise it will
+        return the type of the column (standard type field, not the model).
+
+        If column is not found, raises `SignalResolvingError`.
+        """
         for path, _type, has_subtree, _ in self.get_flat_tree():
-            if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
+            if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join(
+                path
+            ) == col_name:
                 return _type
         raise SignalResolvingError([col_name], "is not found")
 

@@ -492,14 +503,25 @@ class SignalSchema:
                # renaming existing signal
                del new_values[value.name]
                new_values[name] = self.values[value.name]
-[removed line not shown in the diff view]
+                continue
+            if isinstance(value, Column):
+                # adding new signal from existing signal field
+                try:
+                    new_values[name] = self.get_column_type(
+                        value.name, with_subtree=True
+                    )
+                    continue
+                except SignalResolvingError:
+                    pass
+            if isinstance(value, Func):
                # adding new signal with function
                new_values[name] = value.get_result_type(self)
-[removed line not shown in the diff view]
+                continue
+            if isinstance(value, ColumnElement):
                # adding new signal
                new_values[name] = sql_to_python(value)
-[2 removed lines not shown in the diff view]
+                continue
+            new_values[name] = value
 
         return SignalSchema(new_values)
 
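A hedged sketch of the extended lookup, assuming a storage-backed chain whose File signal is named "file" (the URI is a placeholder); this is the resolution path the new Column branch of mutate() relies on:

from datachain.lib.dc import DataChain

chain = DataChain.from_storage("gs://bucket/images/")  # placeholder URI
schema = chain.signals_schema

# With with_subtree=True the top-level model type is reachable; without it only
# leaf (non-model) columns resolve, as before.
file_type = schema.get_column_type("file", with_subtree=True)
path_type = schema.get_column_type("file__path")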
datachain/query/dataset.py
CHANGED
@@ -35,7 +35,6 @@ from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,

@@ -394,6 +393,8 @@ class UDFStep(Step, ABC):
         """
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
+        from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
+
         use_partitioning = self.partition_by is not None
         batching = self.udf.get_batching(use_partitioning)
         workers = self.workers

@@ -1068,6 +1069,7 @@ class DatasetQuery:
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
         self.starting_step = QueryStep(self.catalog, name, self.version)
+        self.dialect = self.catalog.warehouse.db.dialect
 
     def __iter__(self):
         return iter(self.db_results())

@@ -1087,6 +1089,8 @@ class DatasetQuery:
     def delete(
         name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
     ) -> None:
+        from datachain.catalog import get_catalog
+
         catalog = catalog or get_catalog()
         version = version or catalog.get_dataset(name).latest_version
         catalog.remove_dataset(name, version)
datachain/remote/studio.py
CHANGED
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 import os
-from collections.abc import Iterable, Iterator
+from collections.abc import AsyncIterator, Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
 from typing import (

@@ -11,6 +11,9 @@ from typing import (
     Optional,
     TypeVar,
 )
+from urllib.parse import urlparse, urlunparse
+
+import websockets
 
 from datachain.config import Config
 from datachain.dataset import DatasetStats

@@ -22,6 +25,7 @@ LsData = Optional[list[dict[str, Any]]]
 DatasetInfoData = Optional[dict[str, Any]]
 DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
+DatasetJobVersionsData = Optional[dict[str, Any]]
 DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
 FileUploadData = Optional[dict[str, Any]]

@@ -231,6 +235,40 @@ class StudioClient:
 
         return msgpack.ExtType(code, data)
 
+    async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
+        """
+        Follow job logs via websocket connection.
+
+        Args:
+            job_id: ID of the job to follow logs for
+
+        Yields:
+            Dict containing either job status updates or log messages
+        """
+        parsed_url = urlparse(self.url)
+        ws_url = urlunparse(
+            parsed_url._replace(scheme="wss" if parsed_url.scheme == "https" else "ws")
+        )
+        ws_url = f"{ws_url}/logs/follow/?job_id={job_id}&team_name={self.team}"
+
+        async with websockets.connect(
+            ws_url,
+            additional_headers={"Authorization": f"token {self.token}"},
+        ) as websocket:
+            while True:
+                try:
+                    message = await websocket.recv()
+                    data = json.loads(message)
+
+                    # Yield the parsed message data
+                    yield data
+
+                except websockets.exceptions.ConnectionClosed:
+                    break
+                except Exception as e:  # noqa: BLE001
+                    logger.error("Error receiving websocket message: %s", e)
+                    break
+
     def ls(self, paths: Iterable[str]) -> Iterator[tuple[str, Response[LsData]]]:
         # TODO: change LsData (response.data value) to be list of lists
         # to handle cases where a path will be expanded (i.e. globs)

@@ -302,6 +340,13 @@ class StudioClient:
             method="GET",
         )
 
+    def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
+        return self._send_request(
+            "datachain/datasets/dataset_job_versions",
+            {"job_id": job_id},
+            method="GET",
+        )
+
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
             "datachain/datasets/stats",

@@ -359,3 +404,10 @@ class StudioClient:
             "requirements": requirements,
         }
         return self._send_request("datachain/job", data)
+
+    def cancel_job(
+        self,
+        job_id: str,
+    ) -> Response[JobData]:
+        url = f"datachain/job/{job_id}/cancel"
+        return self._send_request(url, data={}, method="POST")
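tail_job_logs() is an async generator, so callers drive it with asyncio. A minimal consumption sketch (the job id and team name are placeholders), essentially what datachain/studio.py does below:

import asyncio

from datachain.remote.studio import StudioClient

async def follow(job_id: str) -> None:
    client = StudioClient(team="my-team")  # team name is illustrative
    async for message in client.tail_job_logs(job_id):
        for log in message.get("logs", []):
            print(log["message"], end="")
        if "job" in message:
            print(f"\nJob status: {message['job']['status']}")

asyncio.run(follow("JOB_ID"))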
datachain/studio.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import os
 from typing import TYPE_CHECKING, Optional
 

@@ -19,7 +20,7 @@ POST_LOGIN_MESSAGE = (
 )
 
 
-def process_studio_cli_args(args: "Namespace"):
+def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
     if args.cmd == "login":
         return login(args)
     if args.cmd == "logout":

@@ -47,6 +48,9 @@ def process_studio_cli_args(args: "Namespace"):
            args.req_file,
        )
 
+    if args.cmd == "cancel":
+        return cancel_job(args.job_id, args.team)
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")

@@ -227,8 +231,34 @@ def create_job(
     if not response.data:
         raise DataChainError("Failed to create job")
 
-[removed line not shown in the diff view]
+    job_id = response.data.get("job", {}).get("id")
+    print(f"Job {job_id} created")
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+    print("=" * 40)
+
+    # Sync usage
+    async def _run():
+        async for message in client.tail_job_logs(job_id):
+            if "logs" in message:
+                for log in message["logs"]:
+                    print(log["message"], end="")
+            elif "job" in message:
+                print(f"\n>>>> Job is now in {message['job']['status']} status.")
+
+    asyncio.run(_run())
+
+    response = client.dataset_job_versions(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    response_data = response.data
+    if response_data:
+        dataset_versions = response_data.get("dataset_versions", [])
+        print("\n\n>>>> Dataset versions created during the job:")
+        for version in dataset_versions:
+            print(f" - {version.get('dataset_name')}@v{version.get('version')}")
+    else:
+        print("No dataset versions created during the job.")
 
 
 def upload_files(client: StudioClient, files: list[str]) -> list[str]:

@@ -248,3 +278,18 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
        if file_id:
            file_ids.append(str(file_id))
    return file_ids
+
+
+def cancel_job(job_id: str, team_name: Optional[str]):
+    token = Config().read().get("studio", {}).get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    client = StudioClient(team=team_name)
+    response = client.cancel_job(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    print(f"Job {job_id} canceled")
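Besides the CLI wiring above (args.cmd == "cancel"), the cancel path can be called programmatically; a hedged sketch, assuming a Studio token is already configured via the login flow and using a placeholder job id:

from datachain.studio import cancel_job

# Passing None for team_name is assumed to fall back to the configured default team.
cancel_job("JOB_ID", team_name=None)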