datachain 0.8.8__py3-none-any.whl → 0.8.10__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,34 +1,31 @@
-def add_studio_parser(subparsers, parent_parser) -> None:
-    studio_help = "Commands to authenticate DataChain with Iterative Studio"
-    studio_description = (
-        "Authenticate DataChain with Studio and set the token. "
-        "Once this token has been properly configured,\n"
-        "DataChain will utilize it for seamlessly sharing datasets\n"
-        "and using Studio features from CLI"
+def add_auth_parser(subparsers, parent_parser) -> None:
+    auth_help = "Manage Studio authentication"
+    auth_description = (
+        "Manage authentication and settings for Studio. "
+        "Configure tokens for sharing datasets and using Studio features."
     )
 
-    studio_parser = subparsers.add_parser(
-        "studio",
+    auth_parser = subparsers.add_parser(
+        "auth",
         parents=[parent_parser],
-        description=studio_description,
-        help=studio_help,
+        description=auth_description,
+        help=auth_help,
     )
-    studio_subparser = studio_parser.add_subparsers(
+    auth_subparser = auth_parser.add_subparsers(
         dest="cmd",
-        help="Use `DataChain studio CMD --help` to display command-specific help.",
-        required=True,
+        help="Use `datachain auth CMD --help` to display command-specific help",
     )
 
-    studio_login_help = "Authenticate DataChain with Studio host"
-    studio_login_description = (
-        "By default, this command authenticates the DataChain with Studio\n"
-        "using default scopes and assigns a random name as the token name."
+    auth_login_help = "Authenticate with Studio"
+    auth_login_description = (
+        "Authenticate with Studio using default scopes. "
+        "A random name will be assigned as the token name if not specified."
     )
-    login_parser = studio_subparser.add_parser(
+    login_parser = auth_subparser.add_parser(
         "login",
         parents=[parent_parser],
-        description=studio_login_description,
-        help=studio_login_help,
+        description=auth_login_description,
+        help=auth_login_help,
     )
 
     login_parser.add_argument(
@@ -36,14 +33,14 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         "--hostname",
         action="store",
         default=None,
-        help="The hostname of the Studio instance to authenticate with.",
+        help="Hostname of the Studio instance",
     )
     login_parser.add_argument(
         "-s",
         "--scopes",
         action="store",
         default=None,
-        help="The scopes for the authentication token. ",
+        help="Authentication token scopes",
     )
 
     login_parser.add_argument(
@@ -51,76 +48,55 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         "--name",
         action="store",
         default=None,
-        help="The name of the authentication token. It will be used to\n"
-        "identify token shown in Studio profile.",
+        help="Authentication token name (shown in Studio profile)",
     )
 
     login_parser.add_argument(
         "--no-open",
         action="store_true",
         default=False,
-        help="Use authentication flow based on user code.\n"
-        "You will be presented with user code to enter in browser.\n"
-        "DataChain will also use this if it cannot launch browser on your behalf.",
+        help="Use code-based authentication without browser",
    )
 
-    studio_logout_help = "Logout user from Studio"
-    studio_logout_description = "This removes the studio token from your global config."
+    auth_logout_help = "Log out from Studio"
+    auth_logout_description = (
+        "Remove the Studio authentication token from global config."
+    )
 
-    studio_subparser.add_parser(
+    auth_subparser.add_parser(
         "logout",
         parents=[parent_parser],
-        description=studio_logout_description,
-        help=studio_logout_help,
+        description=auth_logout_description,
+        help=auth_logout_help,
     )
 
-    studio_team_help = "Set the default team for DataChain"
-    studio_team_description = (
-        "Set the default team for DataChain to use when interacting with Studio."
-    )
+    auth_team_help = "Set default team for Studio operations"
+    auth_team_description = "Set the default team for Studio operations."
 
-    team_parser = studio_subparser.add_parser(
+    team_parser = auth_subparser.add_parser(
         "team",
         parents=[parent_parser],
-        description=studio_team_description,
-        help=studio_team_help,
+        description=auth_team_description,
+        help=auth_team_help,
     )
     team_parser.add_argument(
         "team_name",
         action="store",
-        help="The name of the team to set as the default.",
+        help="Name of the team to set as default",
     )
     team_parser.add_argument(
         "--global",
         action="store_true",
         default=False,
-        help="Set the team globally for all DataChain projects.",
+        help="Set team globally for all projects",
     )
 
-    studio_token_help = "View the token datachain uses to contact Studio"  # noqa: S105 # nosec B105
+    auth_token_help = "View Studio authentication token"  # noqa: S105
+    auth_token_description = "Display the current authentication token for Studio."  # noqa: S105
 
-    studio_subparser.add_parser(
+    auth_subparser.add_parser(
         "token",
         parents=[parent_parser],
-        description=studio_token_help,
-        help=studio_token_help,
-    )
-
-    studio_ls_dataset_help = "List the available datasets from Studio"
-    studio_ls_dataset_description = (
-        "This command lists all the datasets available in Studio.\n"
-        "It will show the dataset name and the number of versions available."
-    )
-
-    ls_dataset_parser = studio_subparser.add_parser(
-        "dataset",
-        parents=[parent_parser],
-        description=studio_ls_dataset_description,
-        help=studio_ls_dataset_help,
-    )
-    ls_dataset_parser.add_argument(
-        "--team",
-        action="store",
-        default=None,
-        help="The team to list datasets for. By default, it will use team from config.",
+        description=auth_token_description,
+        help=auth_token_help,
     )
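For orientation, here is a minimal sketch of how the renamed `auth` parser could be exercised. Only `add_auth_parser` and its flags come from the hunks above; the top-level parser wiring, the `dest="command"` name, and the hostname value are illustrative assumptions.

```python
# Hypothetical wiring of add_auth_parser (from the hunks above) into an argparse CLI.
import argparse

parser = argparse.ArgumentParser(prog="datachain")
parent = argparse.ArgumentParser(add_help=False)
subparsers = parser.add_subparsers(dest="command")  # top-level wiring is assumed

add_auth_parser(subparsers, parent)

# Roughly equivalent to: datachain auth login --hostname studio.example.com --no-open
args = parser.parse_args(["auth", "login", "--hostname", "studio.example.com", "--no-open"])
print(args.command, args.cmd, args.hostname, args.no_open)
# -> auth login studio.example.com True
```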
@@ -30,7 +30,7 @@ def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Act
         "sources",
         type=str,
         nargs=nargs,
-        help="Data sources - paths to cloud storage dirs",
+        help="Data sources - paths to source storage directories or files",
     )
 
 
datachain/cli/utils.py CHANGED
@@ -87,7 +87,7 @@ def get_logging_level(args: Namespace) -> int:
 def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
     if studio and not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain studio login'."
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     if local or studio:
datachain/client/local.py CHANGED
@@ -38,7 +38,7 @@ class FileClient(Client):
     def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
-        return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
+        return StorageURI(f"{cls.PREFIX}/{name.removeprefix('/')}")
 
     @classmethod
     def ls_buckets(cls, **kwargs):
@@ -19,6 +19,7 @@ from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
+from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
 from sqlalchemy.sql.expression import bindparam, cast
 from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm
@@ -40,7 +41,6 @@ if TYPE_CHECKING:
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
-    from sqlalchemy.sql.selectable import Join
     from sqlalchemy.types import TypeEngine
 
     from datachain.lib.file import File
@@ -654,16 +654,47 @@ class SQLiteWarehouse(AbstractWarehouse):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-    ) -> "Join":
+        full: bool = False,
+        columns=None,
+    ) -> "Select":
         """
         Join two tables together.
         """
-        return sqlalchemy.join(
-            left,
-            right,
-            onclause,
-            isouter=not inner,
+        if not full:
+            join_query = sqlalchemy.join(
+                left,
+                right,
+                onclause,
+                isouter=not inner,
+            )
+            return sqlalchemy.select(*columns).select_from(join_query)
+
+        left_right_join = sqlalchemy.select(*columns).select_from(
+            sqlalchemy.join(left, right, onclause, isouter=True)
         )
+        right_left_join = sqlalchemy.select(*columns).select_from(
+            sqlalchemy.join(right, left, onclause, isouter=True)
+        )
+
+        def add_left_rows_filter(exp: BinaryExpression):
+            """
+            Adds filter to right_left_join to remove unmatched left table rows by
+            getting column names that need to be NULL from BinaryExpressions in onclause
+            """
+            return right_left_join.where(
+                getattr(left.c, exp.left.name) == None  # type: ignore[union-attr] # noqa: E711
+            )
+
+        if isinstance(onclause, BinaryExpression):
+            right_left_join = add_left_rows_filter(onclause)
+
+        if isinstance(onclause, BooleanClauseList):
+            for c in onclause.get_children():
+                if isinstance(c, BinaryExpression):
+                    right_left_join = add_left_rows_filter(c)
+
+        union = sqlalchemy.union(left_right_join, right_left_join).subquery()
+        return sqlalchemy.select(*union.c).select_from(union)
 
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
         _FromClauseArgument,
         _OnClauseArgument,
     )
-    from sqlalchemy.sql.selectable import Join, Select
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine
 
     from datachain.data_storage import schema
@@ -873,7 +873,7 @@ class AbstractWarehouse(ABC, Serializable):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-    ) -> "Join":
+    ) -> "Select":
         """
         Join two tables together.
         """
datachain/lib/arrow.py CHANGED
@@ -33,7 +33,7 @@ class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
         # reads the whole file in-memory.
         (uri,) = self.references[path]
         protocol, _ = split_protocol(uri)
-        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+        return self.fss[protocol].open(uri, mode, *args, **kwargs)
 
 
 class ArrowGenerator(Generator):
@@ -52,15 +52,15 @@ def python_to_sql(typ):  # noqa: PLR0911
 
     args = get_args(typ)
     if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
-        if args is None or len(args) != 1:
+        if args is None:
             raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
 
         args0 = args[0]
         if ModelStore.is_pydantic(args0):
             return Array(JSON())
 
-        next_type = python_to_sql(args0)
-        return Array(next_type)
+        list_type = list_of_args_to_type(args)
+        return Array(list_type)
 
     if orig is Annotated:
         # Ignoring annotations
@@ -82,6 +82,18 @@ def python_to_sql(typ):  # noqa: PLR0911
     raise TypeError(f"Cannot recognize type {typ}")
 
 
+def list_of_args_to_type(args) -> SQLType:
+    first_type = python_to_sql(args[0])
+    for next_arg in args[1:]:
+        try:
+            next_type = python_to_sql(next_arg)
+            if next_type != first_type:
+                return JSON()
+        except TypeError:
+            return JSON()
+    return first_type
+
+
 
 def _is_json_inside_union(orig, args) -> bool:
     if orig == Union and len(args) >= 2:
@@ -35,8 +35,7 @@ def unflatten_to_json_pos(
 def _normalize(name: str) -> str:
     if DEFAULT_DELIMITER in name:
         raise RuntimeError(
-            f"variable '{name}' cannot be used "
-            f"because it contains {DEFAULT_DELIMITER}"
+            f"variable '{name}' cannot be used because it contains {DEFAULT_DELIMITER}"
         )
     return _to_snake_case(name)
 
datachain/lib/dc.py CHANGED
@@ -11,6 +11,7 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
+    Literal,
     Optional,
     TypeVar,
     Union,
@@ -1276,7 +1277,12 @@ class DataChain:
             yield ret[0] if len(cols) == 1 else tuple(ret)
 
     def to_pytorch(
-        self, transform=None, tokenizer=None, tokenizer_kwargs=None, num_samples=0
+        self,
+        transform=None,
+        tokenizer=None,
+        tokenizer_kwargs=None,
+        num_samples=0,
+        remove_prefetched: bool = False,
     ):
         """Convert to pytorch dataset format.
 
@@ -1286,6 +1292,7 @@ class DataChain:
             tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
             num_samples (int): Number of random samples to draw for each epoch.
                 This argument is ignored if `num_samples=0` (the default).
+            remove_prefetched (bool): Whether to remove prefetched files after reading.
 
         Example:
             ```py
@@ -1312,6 +1319,7 @@ class DataChain:
             tokenizer_kwargs=tokenizer_kwargs,
             num_samples=num_samples,
             dc_settings=chain._settings,
+            remove_prefetched=remove_prefetched,
         )
 
     def remove_file_signals(self) -> "Self":  # noqa: D102
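A hedged usage sketch of the new `remove_prefetched` flag; the storage URI is illustrative and `torch` must be installed:

```python
# Illustrative only: stream a chain into a torch DataLoader and drop prefetched
# files from the temporary cache once they have been read.
from torch.utils.data import DataLoader
from datachain import DataChain

chain = DataChain.from_storage("s3://my-bucket/images/")  # hypothetical URI
loader = DataLoader(chain.to_pytorch(remove_prefetched=True), batch_size=16)
```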
@@ -1324,6 +1332,7 @@ class DataChain:
         on: Union[MergeColType, Sequence[MergeColType]],
         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
         inner=False,
+        full=False,
         rname="right_",
     ) -> "Self":
         """Merge two chains based on the specified criteria.
@@ -1337,6 +1346,7 @@ class DataChain:
             right_on: Optional predicate or list of Predicates for the `right_ds`
                 to join.
             inner (bool): Whether to run inner join or outer join.
+            full (bool): Whether to run full outer join.
             rname (str): Name prefix for conflicting signal names.
 
         Examples:
@@ -1411,7 +1421,7 @@ class DataChain:
         )
 
         query = self._query.join(
-            right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
+            right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
         )
         query.feature_schema = None
         ds = self._evolve(query=query)
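A hedged example of the new `full=True` option; the column names and values are made up:

```python
# Illustrative full outer merge: rows unmatched on either side are kept,
# with the missing side's signals left empty.
from datachain import DataChain

left = DataChain.from_values(id=[1, 2, 3], a=["x", "y", "z"])
right = DataChain.from_values(id=[2, 3, 4], b=[20, 30, 40])

merged = left.merge(right, on="id", full=True)
merged.show()
```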
@@ -2415,11 +2425,22 @@ class DataChain:
     def export_files(
         self,
         output: str,
-        signal="file",
+        signal: str = "file",
         placement: FileExportPlacement = "fullpath",
         use_cache: bool = True,
+        link_type: Literal["copy", "symlink"] = "copy",
     ) -> None:
-        """Method that exports all files from chain to some folder."""
+        """Export files from a specified signal to a directory.
+
+        Args:
+            output: Path to the target directory for exporting files.
+            signal: Name of the signal to export files from.
+            placement: The method to use for naming exported files.
+                The possible values are: "filename", "etag", "fullpath", and "checksum".
+            use_cache: If `True`, cache the files before exporting.
+            link_type: Method to use for exporting files.
+                Falls back to `'copy'` if symlinking fails.
+        """
         if placement == "filename" and (
             self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
             != self._query.count()
@@ -2427,7 +2448,7 @@ class DataChain:
             raise ValueError("Files with the same name found")
 
         for file in self.collect(signal):
-            file.export(output, placement, use_cache)  # type: ignore[union-attr]
+            file.export(output, placement, use_cache, link_type=link_type)  # type: ignore[union-attr]
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
datachain/lib/file.py CHANGED
@@ -1,3 +1,4 @@
+import errno
 import hashlib
 import io
 import json
@@ -76,18 +77,18 @@ class TarVFile(VFile):
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
         if len(location) > 1:
-            VFileError(file, "multiple 'location's are not supported yet")
+            raise VFileError(file, "multiple 'location's are not supported yet")
 
         loc = location[0]
 
         if (offset := loc.get("offset", None)) is None:
-            VFileError(file, "'offset' is not specified")
+            raise VFileError(file, "'offset' is not specified")
 
         if (size := loc.get("size", None)) is None:
-            VFileError(file, "'size' is not specified")
+            raise VFileError(file, "'size' is not specified")
 
         if (parent := loc.get("parent", None)) is None:
-            VFileError(file, "'parent' is not specified")
+            raise VFileError(file, "'parent' is not specified")
 
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
@@ -236,11 +237,26 @@ class File(DataModel):
         with open(destination, mode="wb") as f:
             f.write(self.read())
 
+    def _symlink_to(self, destination: str):
+        if self.location:
+            raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
+
+        if self._caching_enabled:
+            self.ensure_cached()
+            source = self.get_local_path()
+            assert source, "File was not cached"
+        elif self.source.startswith("file://"):
+            source = self.get_path()
+        else:
+            raise OSError(errno.EXDEV, "can't link across filesystems")
+        return os.symlink(source, destination)
+
     def export(
         self,
         output: str,
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
+        link_type: Literal["copy", "symlink"] = "copy",
     ) -> None:
         """Export file to new location."""
         if use_cache:
@@ -249,6 +265,13 @@ class File(DataModel):
         dst_dir = os.path.dirname(dst)
         os.makedirs(dst_dir, exist_ok=True)
 
+        if link_type == "symlink":
+            try:
+                return self._symlink_to(dst)
+            except OSError as exc:
+                if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
+                    raise
+
         self.save(dst)
 
     def _set_stream(
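The errno-based fallback in `File.export` follows a common pattern: attempt a symlink and copy instead when the platform or filesystem cannot support it. A generic, self-contained sketch (not datachain's API):

```python
# Symlink-or-copy fallback: only "symlink not possible here" errors trigger the copy.
import errno
import os
import shutil


def place_file(src: str, dst: str, link_type: str = "copy") -> None:
    if link_type == "symlink":
        try:
            os.symlink(src, dst)
            return
        except OSError as exc:
            if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
                raise
    shutil.copyfile(src, dst)
```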
datachain/lib/listing.py CHANGED
@@ -113,14 +113,14 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], st
     telemetry.log_param("client", client.PREFIX)
 
     if not uri.endswith("/") and _isfile(client, uri):
-        return None, f'{storage_uri}/{path.lstrip("/")}', path
+        return None, f"{storage_uri}/{path.lstrip('/')}", path
     if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
-        storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
+        storage_uri, path = Client.parse_url(f"{uri.rstrip('/')}/")
         lst_uri_path = path
 
-    lst_uri = f'{storage_uri}/{lst_uri_path.lstrip("/")}'
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
@@ -180,7 +180,7 @@ def get_listing(
     # for local file system we need to fix listing path / prefix
     # if we are reusing existing listing
     if isinstance(client, FileClient) and listing and listing.name != ds_name:
-        list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+        list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"
 
     ds_name = listing.name if listing else ds_name
 
datachain/lib/pytorch.py CHANGED
@@ -50,6 +50,7 @@ class PytorchDataset(IterableDataset):
         tokenizer_kwargs: Optional[dict[str, Any]] = None,
         num_samples: int = 0,
         dc_settings: Optional[Settings] = None,
+        remove_prefetched: bool = False,
     ):
         """
         Pytorch IterableDataset that streams DataChain datasets.
@@ -84,6 +85,7 @@ class PytorchDataset(IterableDataset):
 
         self._cache = catalog.cache
         self._prefetch_cache: Optional[Cache] = None
+        self._remove_prefetched = remove_prefetched
         if prefetch and not self.cache:
             tmp_dir = catalog.cache.tmp_dir
             assert tmp_dir
@@ -147,7 +149,7 @@ class PytorchDataset(IterableDataset):
             rows,
             self.prefetch,
             download_cb=download_cb,
-            after_prefetch=download_cb.increment_file_count,
+            remove_prefetched=self._remove_prefetched,
         )
 
         with download_cb, closing(rows):