datachain 0.8.9__py3-none-any.whl → 0.8.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as possibly problematic by the registry diff viewer.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +102 -138
- datachain/cli/__init__.py +9 -9
- datachain/cli/parser/__init__.py +36 -20
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/studio.py +35 -34
- datachain/cli/parser/utils.py +19 -1
- datachain/cli/utils.py +1 -1
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/data_storage/sqlite.py +38 -7
- datachain/data_storage/warehouse.py +2 -2
- datachain/dataset.py +1 -1
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +67 -23
- datachain/func/func.py +17 -5
- datachain/lib/convert/python_to_sql.py +15 -3
- datachain/lib/dc.py +27 -5
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/query/dataset.py +8 -4
- datachain/remote/studio.py +3 -3
- datachain/sql/sqlite/base.py +35 -14
- datachain/studio.py +8 -8
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/METADATA +3 -7
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/RECORD +38 -38
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/LICENSE +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/WHEEL +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
````diff
@@ -25,6 +25,7 @@ from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
 from datachain.dataset import DatasetRecord
+from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
 from datachain.lib.convert.python_to_sql import python_to_sql
@@ -1129,8 +1130,12 @@ class DataChain:
             )
             ```
         """
+        primitives = (bool, str, int, float)
+
         for col_name, expr in kwargs.items():
-            if not isinstance(expr, (Column, Func)) and isinstance(
+            if not isinstance(expr, (*primitives, Column, Func)) and isinstance(
+                expr.type, NullType
+            ):
                 raise DataChainColumnError(
                     col_name, f"Cannot infer type with expression {expr}"
                 )
@@ -1145,6 +1150,11 @@ class DataChain:
             elif isinstance(value, Func):
                 # adding new signal
                 mutated[name] = value.get_column(schema)
+            elif isinstance(value, primitives):
+                # adding simple python constant primitives like str, int, float, bool
+                val = literal(value)
+                val.type = python_to_sql(type(value))()
+                mutated[name] = val  # type: ignore[assignment]
             else:
                 # adding new signal
                 mutated[name] = value
@@ -1332,6 +1342,7 @@ class DataChain:
         on: Union[MergeColType, Sequence[MergeColType]],
         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
         inner=False,
+        full=False,
         rname="right_",
     ) -> "Self":
         """Merge two chains based on the specified criteria.
@@ -1345,6 +1356,7 @@ class DataChain:
             right_on: Optional predicate or list of Predicates for the `right_ds`
                 to join.
             inner (bool): Whether to run inner join or outer join.
+            full (bool): Whether to run full outer join.
             rname (str): Name prefix for conflicting signal names.
 
         Examples:
@@ -1419,7 +1431,7 @@ class DataChain:
         )
 
         query = self._query.join(
-            right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
+            right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
         )
         query.feature_schema = None
         ds = self._evolve(query=query)
@@ -1940,7 +1952,7 @@ class DataChain:
     def from_csv(
         cls,
         path,
-        delimiter: str = ",",
+        delimiter: Optional[str] = None,
         header: bool = True,
         output: OutputType = None,
         object_name: str = "",
@@ -1950,6 +1962,7 @@ class DataChain:
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+        parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1957,7 +1970,8 @@ class DataChain:
         Parameters:
             path : Storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///".
-            delimiter : Character for delimiting columns.
+            delimiter : Character for delimiting columns. Takes precedence if also
+                specified in `parse_options`. Defaults to ",".
             header : Whether the files include a header row.
             output : Dictionary or feature class defining column names and their
                 corresponding types. List of column names is also accepted, in which
@@ -1971,6 +1985,8 @@ class DataChain:
             column_types : Dictionary of column names and their corresponding types.
                 It is passed to CSV reader and for each column specified type auto
                 inference is disabled.
+            parse_options: Tells the parser how to process lines.
+                See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
 
         Example:
             Reading a csv file:
@@ -1988,6 +2004,12 @@ class DataChain:
         from pyarrow.dataset import CsvFileFormat
         from pyarrow.lib import type_for_alias
 
+        parse_options = parse_options or {}
+        if "delimiter" not in parse_options:
+            parse_options["delimiter"] = ","
+        if delimiter:
+            parse_options["delimiter"] = delimiter
+
         if column_types:
             column_types = {
                 name: type_for_alias(typ) if isinstance(typ, str) else typ
@@ -2015,7 +2037,7 @@ class DataChain:
             msg = f"error parsing csv - incompatible output type {type(output)}"
             raise DatasetPrepareError(chain.name, msg)
 
-        parse_options = ParseOptions(delimiter=delimiter)
+        parse_options = ParseOptions(**parse_options)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
             strings_can_be_null=True,
````
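Taken together, the `dc.py` changes surface three user-facing additions: `mutate()` now accepts plain Python constants (`bool`, `str`, `int`, `float`) and wraps them in typed SQL literals, `merge()` gains a `full=` flag that is passed down to the underlying join, and `from_csv()` accepts a `parse_options` dict forwarded to PyArrow's `ParseOptions` (an explicit `delimiter=` still wins). A minimal usage sketch; the column names and the bucket path are made up for illustration:

```python
from datachain import DataChain

# mutate() with constant primitives: each value is turned into a SQL literal
# of the matching type instead of failing with "Cannot infer type".
animals = DataChain.from_values(name=["cat", "dog"], weight=[4.2, 7.9])
labeled = animals.mutate(source="manual", version=2, verified=True)

# merge() with full=True keeps unmatched rows from both sides (full outer join).
left = DataChain.from_values(id=[1, 2], a=["x", "y"])
right = DataChain.from_values(id=[2, 3], b=["u", "v"])
joined = left.merge(right, on="id", full=True)

# from_csv() with parse_options forwarded to pyarrow.csv.ParseOptions.
rows = DataChain.from_csv(
    "s3://my-bucket/data/",  # hypothetical location
    parse_options={"delimiter": ";", "ignore_empty_lines": True},
)
```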
datachain/lib/file.py
CHANGED
```diff
@@ -190,6 +190,22 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled: bool = False
 
+    @classmethod
+    def upload(
+        cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
+    ) -> "File":
+        if catalog is None:
+            from datachain.catalog.loader import get_catalog
+
+            catalog = get_catalog()
+
+        parent, name = posixpath.split(path)
+
+        client = catalog.get_client(parent)
+        file = client.upload(data, name)
+        file._set_stream(catalog)
+        return file
+
     @classmethod
     def _from_row(cls, row: "RowDict") -> "Self":
         return cls(**{key: row[key] for key in cls._datachain_column_types})
```
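The new `File.upload` classmethod writes raw bytes through the client resolved from the destination path's parent and returns a `File` already bound to a catalog (the default catalog if none is passed). A hedged sketch; the destination URI is illustrative:

```python
from datachain.lib.file import File

payload = b'{"status": "ok"}'

# Upload bytes to a storage location and get back a File object that can be
# used like any other File signal (opened, streamed, cached).
report = File.upload(payload, "s3://my-bucket/reports/status.json")  # hypothetical URI
print(report.path)
```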
datachain/lib/listing.py
CHANGED
```diff
@@ -1,3 +1,5 @@
+import logging
+import os
 import posixpath
 from collections.abc import Iterator
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar
@@ -7,6 +9,7 @@ from sqlalchemy.sql.expression import true
 
 from datachain.asyn import iter_over_async
 from datachain.client import Client
+from datachain.error import REMOTE_ERRORS, ClientError
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
@@ -22,6 +25,10 @@ LISTING_PREFIX = "lst__"  # listing datasets start with this name
 
 D = TypeVar("D", bound="DataChain")
 
+# Disable warnings for remote errors in clients
+logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
+logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
+
 
 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
@@ -90,6 +97,15 @@ def _isfile(client: "Client", path: str) -> bool:
     Returns True if uri points to a file
     """
     try:
+        if "://" in path:
+            # This makes sure that the uppercase scheme is converted to lowercase
+            scheme, path = path.split("://", 1)
+            path = f"{scheme.lower()}://{path}"
+
+        if os.name == "nt" and "*" in path:
+            # On Windows, the glob pattern "*" is not supported
+            return False
+
         info = client.fs.info(path)
         name = info.get("name")
         # case for special simulated directories on some clouds
@@ -99,21 +115,21 @@ def _isfile(client: "Client", path: str) -> bool:
             return False
 
         return info["type"] == "file"
-    except
+    except FileNotFoundError:
         return False
+    except REMOTE_ERRORS as e:
+        raise ClientError(
+            message=str(e),
+            error_code=getattr(e, "code", None),
+        ) from e
 
 
-def parse_listing_uri(uri: str,
+def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
     client_config = client_config or {}
-    client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
-    telemetry.log_param("client", client.PREFIX)
-
-    if not uri.endswith("/") and _isfile(client, uri):
-        return None, f"{storage_uri}/{path.lstrip('/')}", path
     if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
@@ -157,13 +173,15 @@ def get_listing(
     client_config = catalog.client_config
 
     client = Client.get_client(uri, cache, **client_config)
-
-    listing = None
+    telemetry.log_param("client", client.PREFIX)
 
-    #
-    if not
-
+    # we don't want to use cached dataset (e.g. for a single file listing)
+    if not uri.endswith("/") and _isfile(client, uri):
+        storage_uri, path = Client.parse_url(uri)
+        return None, f"{storage_uri}/{path.lstrip('/')}", path, False
 
+    ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
+    listing = None
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
    ]
```
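With these listing changes, failures coming from the cloud SDKs while checking a single-file URI are no longer treated as "not a file": anything matching `REMOTE_ERRORS` is re-raised as `datachain.error.ClientError` carrying the original message and, when available, an error code. A sketch of catching it around a listing call, assuming `ClientError` exposes the `error_code` it was constructed with and that it propagates out of `from_storage`:

```python
from datachain import DataChain
from datachain.error import ClientError

try:
    chain = DataChain.from_storage("s3://my-bucket/images/")  # hypothetical bucket
except ClientError as exc:
    # e.g. access denied or an expired token reported by the cloud client
    print(f"listing failed: {exc} (code={getattr(exc, 'error_code', None)})")
```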
datachain/lib/pytorch.py
CHANGED
```diff
@@ -23,7 +23,7 @@ from datachain.query.dataset import get_download_callback
 if TYPE_CHECKING:
     from torchvision.transforms.v2 import Transform
 
-    from datachain.cache import
+    from datachain.cache import Cache
 
 
 logger = logging.getLogger("datachain")
```
datachain/lib/udf.py
CHANGED
```diff
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
 
     from typing_extensions import Self
 
-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.catalog import Catalog
     from datachain.lib.signal_schema import SignalSchema
     from datachain.lib.udf_signature import UdfSignature
```
datachain/listing.py
CHANGED
```diff
@@ -2,7 +2,6 @@ import glob
 import os
 from collections.abc import Iterable, Iterator
 from functools import cached_property
-from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
 from sqlalchemy import Column
@@ -101,11 +100,8 @@ class Listing:
         copy_to_filename: Optional[str],
         recursive=False,
         copy_dir_contents=False,
-        relative_path=None,
-        from_edatachain=False,
         from_dataset=False,
     ) -> list[NodeWithPath]:
-        rel_path_elements = relative_path.split("/") if relative_path else []
         all_nodes: list[NodeWithPath] = []
         for src in sources:
             node = src.node
@@ -119,15 +115,7 @@ class Listing:
                 )
             else:
                 node_path = []
-            if
-                for rpe, npe in zip_longest(
-                    rel_path_elements, node.path.split("/")
-                ):
-                    if rpe == npe:
-                        continue
-                    if npe:
-                        node_path.append(npe)
-            elif copy_to_filename:
+            if copy_to_filename:
                 node_path = [os.path.basename(copy_to_filename)]
             elif from_dataset:
                 node_path = [
```
datachain/node.py
CHANGED
```diff
@@ -84,18 +84,6 @@ class Node:
         fd.write(f" size: {self.size}\n")
         return size
 
-    def get_metafile_data(self, path: str):
-        data: dict[str, Any] = {
-            "name": path,
-            "etag": self.etag,
-        }
-        version = self.version
-        if version:
-            data["version"] = version
-        data["last_modified"] = time_to_str(self.last_modified)
-        data["size"] = self.size
-        return data
-
     @property
     def full_path(self) -> str:
         if self.is_dir and self.path:
@@ -181,9 +169,6 @@ class NodeWithPath:
     def append_to_file(self, fd):
         return self.n.append_to_file(fd, "/".join(self.path))
 
-    def get_metafile_data(self):
-        return self.n.get_metafile_data("/".join(self.path))
-
     @property
     def full_path(self) -> str:
         path = "/".join(self.path)
```
datachain/nodes_fetcher.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from datachain.nodes_thread_pool import NodesThreadPool
 
 if TYPE_CHECKING:
-    from datachain.cache import
+    from datachain.cache import Cache
     from datachain.client.fsspec import Client
     from datachain.node import Node
 
@@ -13,7 +13,7 @@ logger = logging.getLogger("datachain")
 
 
 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client: "Client", max_threads: int, cache: "
+    def __init__(self, client: "Client", max_threads: int, cache: "Cache"):
         super().__init__(max_threads)
         self.client = client
         self.cache = cache
```
datachain/query/dataset.py
CHANGED
```diff
@@ -875,6 +875,7 @@ class SQLJoin(Step):
     query2: "DatasetQuery"
     predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
     inner: bool
+    full: bool
     rname: str
 
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
@@ -977,14 +978,14 @@ class SQLJoin(Step):
         self.validate_expression(join_expression, q1, q2)
 
         def q(*columns):
-
+            return self.catalog.warehouse.join(
                 q1,
                 q2,
                 join_expression,
                 inner=self.inner,
+                full=self.full,
+                columns=columns,
             )
-            return sqlalchemy.select(*columns).select_from(join_query)
-            # return sqlalchemy.select(*subquery.c).select_from(subquery)
 
         return step_result(
             q,
@@ -1489,6 +1490,7 @@ class DatasetQuery:
         dataset_query: "DatasetQuery",
         predicates: Union[JoinPredicateType, Sequence[JoinPredicateType]],
         inner=False,
+        full=False,
         rname="{name}_right",
     ) -> "Self":
         left = self.clone(new_table=False)
@@ -1504,7 +1506,9 @@ class DatasetQuery:
             if isinstance(predicates, (str, ColumnClause, ColumnElement))
             else tuple(predicates)
         )
-        new_query.steps = [
+        new_query.steps = [
+            SQLJoin(self.catalog, left, right, predicates, inner, full, rname)
+        ]
         return new_query
 
     @detach
```
datachain/remote/studio.py
CHANGED
```diff
@@ -75,7 +75,7 @@ class StudioClient:
 
         if not token:
             raise DataChainError(
-                "Studio token is not set. Use `datachain
+                "Studio token is not set. Use `datachain auth login` "
                 "or environment variable `DVC_STUDIO_TOKEN` to set it."
             )
 
@@ -105,7 +105,7 @@ class StudioClient:
         if not team:
             raise DataChainError(
                 "Studio team is not set. "
-                "Use `datachain
+                "Use `datachain auth team <team_name>` "
                 "or environment variable `DVC_STUDIO_TEAM` to set it."
                 "You can also set it in the config file as team under studio."
             )
@@ -375,7 +375,7 @@ class StudioClient:
             method="GET",
         )
 
-    def upload_file(self,
+    def upload_file(self, content: bytes, file_name: str) -> Response[FileUploadData]:
         data = {
             "file_content": base64.b64encode(content).decode("utf-8"),
             "file_name": file_name,
```
datachain/sql/sqlite/base.py
CHANGED
```diff
@@ -4,6 +4,7 @@ import sqlite3
 import warnings
 from collections.abc import Iterable
 from datetime import MAXYEAR, MINYEAR, datetime, timezone
+from functools import cache
 from types import MappingProxyType
 from typing import Callable, Optional
 
@@ -526,24 +527,44 @@ def compile_collect(element, compiler, **kwargs):
     return compiler.process(func.json_group_array(*element.clauses.clauses), **kwargs)
 
 
-
+@cache
+def usearch_sqlite_path() -> Optional[str]:
     try:
-
-
-
+        import usearch
+    except ImportError:
+        return None
 
-
+    with warnings.catch_warnings():
+        # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
+        # and, sometimes fail to download the binary in other platforms
+        # triggering UserWarning.
 
-
-    # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
-    # and, sometimes fail to download the binary in other platforms
-    # triggering UserWarning.
+        warnings.filterwarnings("ignore", category=UserWarning, module="usearch")
 
-
-
+        try:
+            return usearch.sqlite_path()
+        except FileNotFoundError:
+            return None
 
-    conn.enable_load_extension(False)
-    return True
 
-
+def load_usearch_extension(conn: sqlite3.Connection) -> bool:
+    # usearch is part of the vector optional dependencies
+    # we use the extension's cosine and euclidean distance functions
+    ext_path = usearch_sqlite_path()
+    if ext_path is None:
+        return False
+
+    try:
+        conn.enable_load_extension(True)
+    except AttributeError:
+        # sqlite3 module is not built with loadable extension support by default.
+        return False
+
+    try:
+        conn.load_extension(ext_path)
+    except sqlite3.OperationalError:
         return False
+    else:
+        return True
+    finally:
+        conn.enable_load_extension(False)
```
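The usearch logic is now split in two: `usearch_sqlite_path()` resolves (and caches) the path to the usearch SQLite extension when the optional dependency is importable, while `load_usearch_extension(conn)` enables extension loading, loads the extension, and always turns extension loading back off, returning `False` whenever any step is unavailable. A sketch of how a caller might use the boolean result:

```python
import sqlite3

from datachain.sql.sqlite.base import load_usearch_extension

conn = sqlite3.connect(":memory:")
if load_usearch_extension(conn):
    # usearch-provided distance functions are available to SQL queries
    print("usearch extension loaded")
else:
    # optional dependency missing, no binary for this platform, or the
    # sqlite3 module was built without loadable-extension support
    print("usearch extension not available")
```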
datachain/studio.py
CHANGED
```diff
@@ -47,7 +47,7 @@ def process_jobs_args(args: "Namespace"):
         raise DataChainError(f"Unknown command '{args.cmd}'.")
 
 
-def
+def process_auth_cli_args(args: "Namespace"):
     if args.cmd is None:
         print(
             f"Use 'datachain {args.command} --help' to see available options",
@@ -95,7 +95,7 @@ def login(args: "Namespace"):
         raise DataChainError(
             "Token already exists. "
            "To login with a different token, "
-            "logout using `datachain
+            "logout using `datachain auth logout`."
         )
 
     open_browser = not args.no_open
@@ -121,12 +121,12 @@ def logout():
     token = conf.get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     del conf["studio"]["token"]
 
-    print("Logged out from Studio. (you can log back in with 'datachain
+    print("Logged out from Studio. (you can log back in with 'datachain auth login')")
 
 
 def token():
@@ -134,7 +134,7 @@ def token():
     token = config.get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     print(token)
@@ -282,7 +282,7 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
         file_name = os.path.basename(file)
         with open(file, "rb") as f:
             file_content = f.read()
-        response = client.upload_file(
+        response = client.upload_file(file_content, file_name)
         if not response.ok:
             raise_remote_error(response.message)
 
@@ -299,7 +299,7 @@ def cancel_job(job_id: str, team_name: Optional[str]):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     client = StudioClient(team=team_name)
@@ -314,7 +314,7 @@ def show_job_logs(job_id: str, team_name: Optional[str]):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     client = StudioClient(team=team_name)
```
{datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: datachain
-Version: 0.8.9
+Version: 0.8.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -78,7 +78,6 @@ Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
-Requires-Dist: open_clip_torch; extra == "tests"
 Requires-Dist: aiotools>=1.7.0; extra == "tests"
 Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
@@ -94,12 +93,9 @@ Provides-Extra: examples
 Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
-Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
-Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
-Requires-Dist: pdfplumber==0.11.5; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: ultralytics==8.3.68; extra == "examples"
+Requires-Dist: open_clip_torch; extra == "examples"
 
 ================
 |logo| DataChain
```