datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/cli/commands/ls.py CHANGED
@@ -1,13 +1,14 @@
 import shlex
 from collections.abc import Iterable, Iterator
 from itertools import chain
-from typing import TYPE_CHECKING, Optional
-
-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
+from typing import TYPE_CHECKING
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
+from datachain.query.session import Session
+
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
 
 
 def ls(
@@ -16,7 +17,7 @@ def ls(
     studio: bool = False,
     local: bool = False,
     all: bool = True,
-    team: Optional[str] = None,
+    team: str | None = None,
     **kwargs,
 ):
     token = Config().read().get("studio", {}).get("token")
@@ -32,18 +33,15 @@ def ls(
 def ls_local(
     sources,
     long: bool = False,
-    catalog: Optional["Catalog"] = None,
+    catalog=None,
     client_config=None,
     **kwargs,
 ):
     from datachain import listings
 
     if sources:
-        if catalog is None:
-            from datachain.catalog import get_catalog
-
-            catalog = get_catalog(client_config=client_config)
-
+        session = Session.get(catalog=catalog, client_config=client_config)
+        catalog = session.catalog
         actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
         if len(actual_sources) == 1:
             for _, entries in actual_sources:
@@ -63,8 +61,8 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(listings().collect("listing"))
-        for ls in listing:
+        listing = listings().to_list("listing")
+        for (ls,) in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
@@ -78,7 +76,7 @@ def format_ls_entry(entry: str) -> str:
 def ls_remote(
     paths: Iterable[str],
     long: bool = False,
-    team: Optional[str] = None,
+    team: str | None = None,
 ):
     from datachain.node import long_line_str
     from datachain.remote.studio import StudioClient
@@ -145,7 +143,7 @@ def _ls_urls_flat(
     long: bool,
     catalog: "Catalog",
     **kwargs,
-) -> Iterator[tuple[str, Iterator[str]]]:
+) -> Iterator[tuple[str, Iterable[str]]]:
     from datachain.client import Client
     from datachain.node import long_line_str
 
@@ -154,7 +152,9 @@ def _ls_urls_flat(
         if client_cls.is_root_url(source):
             buckets = client_cls.ls_buckets(**catalog.client_config)
             if long:
-                values = (long_line_str(b.name, b.created) for b in buckets)
+                values: Iterable[str] = (
+                    long_line_str(b.name, b.created) for b in buckets
+                )
             else:
                 values = (b.name for b in buckets)
             yield source, values
@@ -164,7 +164,7 @@ def _ls_urls_flat(
         if long:
             fields.append("last_modified")
         for data_source, results in catalog.ls([source], fields=fields, **kwargs):
-            values = (_node_data_to_ls_values(r, long) for r in results)
+            values = [_node_data_to_ls_values(r, long) for r in results]
             found = True
             yield data_source.dirname(), values
         if not found:
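
Context for the ls_local change: 0.39 replaces collect() with to_list(), which returns one tuple per row, hence the new `for (ls,) in listing` unpacking. A minimal sketch of the pattern, assuming the 0.39-era read_values constructor and made-up data:

    import datachain as dc

    chain = dc.read_values(num=[1, 2, 3])  # illustrative values only

    # collect("num") used to yield bare values; to_list("num") returns a list
    # of row tuples, so one selected signal still arrives as a 1-tuple:
    for (num,) in chain.to_list("num"):
        print(num)
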
datachain/cli/commands/show.py CHANGED
@@ -1,5 +1,5 @@
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from datachain.lib.signal_schema import SignalSchema
 
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
 def show(
     catalog: "Catalog",
     name: str,
-    version: Optional[int] = None,
+    version: str | None = None,
     limit: int = 10,
     offset: int = 0,
     columns: Sequence[str] = (),
@@ -42,8 +42,8 @@ def show(
     print("Name: ", name)
     if dataset.description:
         print("Description: ", dataset.description)
-    if dataset.labels:
-        print("Labels: ", ",".join(dataset.labels))
+    if dataset.attrs:
+        print("Attributes: ", ",".join(dataset.attrs))
     print("\n")
 
     show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
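
The version parameter moving from Optional[int] to str | None matches the switch to semver-style dataset versions (note the new datachain/semver.py in the file list). A stdlib-only illustration of the value shape now expected; the version string is made up:

    major, minor, patch = map(int, "1.0.2".split("."))
    print(major, minor, patch)  # 1 0 2
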
datachain/cli/parser/__init__.py CHANGED
@@ -3,7 +3,7 @@ from importlib.metadata import PackageNotFoundError, version
 
 import shtab
 
-from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
+from datachain.cli.utils import BooleanOptionalAction
 
 from .job import add_jobs_parser
 from .studio import add_auth_parser
@@ -16,9 +16,7 @@ from .utils import (
     add_update_arg,
     find_columns_type,
 )
-from .utils import (
-    CustomArgumentParser as ArgumentParser,
-)
+from .utils import CustomArgumentParser as ArgumentParser
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -217,29 +215,9 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Dataset description",
     )
     parse_edit_dataset.add_argument(
-        "--labels",
+        "--attrs",
         nargs="+",
-        help="Dataset labels",
-    )
-    parse_edit_dataset.add_argument(
-        "--studio",
-        action="store_true",
-        default=False,
-        help="Edit dataset from Studio",
-    )
-    parse_edit_dataset.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Edit local dataset only",
-    )
-    parse_edit_dataset.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Edit both datasets from studio and local",
+        help="Dataset attributes",
     )
     parse_edit_dataset.add_argument(
         "--team",
@@ -302,7 +280,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--version",
         action="store",
         default=None,
-        type=int,
+        type=str,
         help="Dataset version",
     )
     rm_dataset_parser.add_argument(
@@ -315,21 +293,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio",
-    )
-    rm_dataset_parser.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Remove local datasets only",
-    )
-    rm_dataset_parser.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Remove both local and studio",
+        help="Remove dataset from Studio only",
     )
     rm_dataset_parser.add_argument(
         "--team",
@@ -495,43 +459,12 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--version",
         action="store",
         default=None,
-        type=int,
+        type=str,
         help="Dataset version",
     )
     show_parser.add_argument("--schema", action="store_true", help="Show schema")
     add_show_args(show_parser)
 
-    query_parser = subp.add_parser(
-        "query",
-        parents=[parent_parser],
-        description="Create a new dataset with a query script.",
-        formatter_class=CustomHelpFormatter,
-    )
-    add_anon_arg(query_parser)
-    query_parser.add_argument(
-        "script", metavar="<script.py>", type=str, help="Filepath for script"
-    )
-    query_parser.add_argument(
-        "--parallel",
-        nargs="?",
-        type=int,
-        const=-1,
-        default=None,
-        metavar="N",
-        help=(
-            "Use multiprocessing to run any query script UDFs with N worker processes. "
-            "N defaults to the CPU count"
-        ),
-    )
-    query_parser.add_argument(
-        "-p",
-        "--param",
-        metavar="param=value",
-        nargs=1,
-        action=KeyValueArgs,
-        help="Query parameters",
-    )
-
     parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
@@ -550,6 +483,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
+
     add_completion_parser(subp, [parent_parser])
     return parser
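
Both --version options above likewise switch from type=int to type=str. A self-contained argparse sketch (not the real datachain parser) of why the int variant had to go:

    from argparse import ArgumentParser

    p = ArgumentParser(prog="demo")
    p.add_argument("--version", action="store", default=None, type=str)

    print(p.parse_args(["--version", "1.2.3"]).version)  # 1.2.3
    # with type=int the same input would abort: int("1.2.3") raises ValueError
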
datachain/cli/parser/job.py CHANGED
@@ -13,11 +13,16 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     )
     jobs_subparser = jobs_parser.add_subparsers(
         dest="cmd",
-        help="Use `datachain auth CMD --help` to display command-specific help",
+        help="Use `datachain job CMD --help` to display command-specific help",
     )
 
     studio_run_help = "Run a job in Studio"
-    studio_run_description = "Run a job in Studio."
+    studio_run_description = "Run a job in Studio. \n"
+    studio_run_description += (
+        "When using --start-time or --cron,"
+        " the job is scheduled to run but won't start immediately"
+        " (can be seen in the Tasks tab in UI)"
+    )
 
     studio_run_parser = jobs_subparser.add_parser(
         "run",
@@ -51,6 +56,20 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         help="Environment variables in KEY=VALUE format",
     )
 
+    studio_run_parser.add_argument(
+        "--cluster",
+        type=str,
+        action="store",
+        help="Compute cluster to run the job on",
+    )
+
+    studio_run_parser.add_argument(
+        "-c",
+        "--credentials-name",
+        action="store",
+        help="Name of the credentials to use for the job",
+    )
+
     studio_run_parser.add_argument(
         "--workers",
         type=int,
@@ -64,7 +83,12 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     studio_run_parser.add_argument(
         "--python-version",
         action="store",
-        help="Python version for the job (e.g., 3.9, 3.10, 3.11)",
+        help="Python version for the job (e.g., 3.10, 3.11, 3.12, 3.13)",
+    )
+    studio_run_parser.add_argument(
+        "--repository",
+        action="store",
+        help="Repository URL to clone before running the job",
     )
     studio_run_parser.add_argument(
         "--req-file",
@@ -77,6 +101,56 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         nargs="+",
         help="Python package requirements",
     )
+    studio_run_parser.add_argument(
+        "--priority",
+        type=int,
+        default=5,
+        help="Priority for the job in range 0-5. "
+        "Lower value is higher priority (default: 5)",
+    )
+    studio_run_parser.add_argument(
+        "--start-time",
+        action="store",
+        help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
+    )
+    studio_run_parser.add_argument(
+        "--cron", action="store", help="Cron expression for the cron task."
+    )
+    studio_run_parser.add_argument(
+        "--no-wait",
+        action="store_true",
+        help="Do not wait for the job to finish",
+    )
+
+    studio_ls_help = "List jobs in Studio"
+    studio_ls_description = "List jobs in Studio."
+
+    studio_ls_parser = jobs_subparser.add_parser(
+        "ls",
+        parents=[parent_parser],
+        description=studio_ls_description,
+        help=studio_ls_help,
+        formatter_class=CustomHelpFormatter,
+    )
+
+    studio_ls_parser.add_argument(
+        "--status",
+        action="store",
+        help="Status to filter jobs by",
+    )
+
+    studio_ls_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="Team to list jobs for (default: from config)",
+    )
+    studio_ls_parser.add_argument(
+        "--limit",
+        type=int,
+        default=20,
+        help="Limit the number of jobs returned (default: 20)",
+    )
 
     studio_cancel_help = "Cancel a job in Studio"
     studio_cancel_description = "Cancel a running job in Studio."
@@ -123,3 +197,21 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         default=None,
         help="Team to check logs for (default: from config)",
     )
+
+    studio_clusters_help = "List compute clusters in Studio"
+    studio_clusters_description = "List compute clusters in Studio."
+
+    studio_clusters_parser = jobs_subparser.add_parser(
+        "clusters",
+        parents=[parent_parser],
+        description=studio_clusters_description,
+        help=studio_clusters_help,
+        formatter_class=CustomHelpFormatter,
+    )
+
+    studio_clusters_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="Team to list clusters for (default: from config)",
+    )
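
A self-contained sketch (not the real datachain parser) that rebuilds just the new scheduling flags of `job run` to show how they parse; the sample argv is made up:

    from argparse import ArgumentParser

    run = ArgumentParser(prog="datachain job run")
    run.add_argument("--priority", type=int, default=5)   # 0-5, lower runs first
    run.add_argument("--start-time", action="store")      # schedule, don't start now
    run.add_argument("--cron", action="store")            # recurring schedule
    run.add_argument("--no-wait", action="store_true")    # return without waiting

    args = run.parse_args(["--cron", "0 3 * * *", "--no-wait"])
    print(args.priority, args.cron, args.no_wait)  # 5 0 3 * * * True
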
datachain/cli/parser/studio.py CHANGED
@@ -89,8 +89,13 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         help="Remove the token from the local project config",
     )
 
-    auth_team_help = "Set default team for Studio operations"
-    auth_team_description = "Set the default team for Studio operations."
+    auth_team_help = "Set or show default team for Studio operations"
+    auth_team_description = (
+        "Set or show the default team for Studio operations. "
+        "This will be used globally by default. "
+        "Use --local to set the team locally for the current project. "
+        "If no team name is provided, the default team will be shown."
+    )
 
     team_parser = auth_subparser.add_parser(
         "team",
@@ -102,13 +107,15 @@ def add_auth_parser(subparsers, parent_parser) -> None:
     team_parser.add_argument(
         "team_name",
         action="store",
+        default=None,
+        nargs="?",
         help="Name of the team to set as default",
     )
     team_parser.add_argument(
-        "--global",
+        "--local",
         action="store_true",
         default=False,
-        help="Set team globally for all projects",
+        help="Set team locally for the current project",
     )
 
     auth_token_help = "View Studio authentication token"  # noqa: S105
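
With nargs="?" the team_name positional is now optional, so running the command without an argument parses as None and the CLI shows the current default team instead of erroring. A minimal argparse sketch of that pattern:

    from argparse import ArgumentParser

    team = ArgumentParser(prog="datachain auth team")
    team.add_argument("team_name", nargs="?", default=None)
    team.add_argument("--local", action="store_true", default=False)

    print(team.parse_args([]).team_name)           # None -> show default team
    print(team.parse_args(["my-team"]).team_name)  # my-team -> set default
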
datachain/cli/parser/utils.py CHANGED
@@ -1,5 +1,4 @@
 from argparse import Action, ArgumentParser, ArgumentTypeError, HelpFormatter
-from typing import Union
 
 from datachain.cli.utils import CommaSeparatedArgs
 
@@ -44,7 +43,7 @@ def parse_find_column(column: str) -> str:
     )
 
 
-def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
+def add_sources_arg(parser: ArgumentParser, nargs: str | int = "+") -> Action:
     return parser.add_argument(
         "sources",
         type=str,
datachain/cli/utils.py CHANGED
@@ -1,6 +1,5 @@
 import logging
-from argparse import SUPPRESS, Action, ArgumentError, Namespace, _AppendAction
-from typing import Optional
+from argparse import SUPPRESS, Action, Namespace, _AppendAction
 
 from datachain.error import DataChainError
 
@@ -64,18 +63,6 @@ class CommaSeparatedArgs(_AppendAction):  # pylint: disable=protected-access
         setattr(namespace, self.dest, list(dict.fromkeys(items)))
 
 
-class KeyValueArgs(_AppendAction):  # pylint: disable=protected-access
-    def __call__(self, parser, namespace, values, option_string=None):
-        items = getattr(namespace, self.dest) or {}
-        for raw_value in filter(bool, values):
-            key, sep, value = raw_value.partition("=")
-            if not key or not sep or value == "":
-                raise ArgumentError(self, f"expected 'key=value', got {raw_value!r}")
-            items[key.strip()] = value
-
-        setattr(namespace, self.dest, items)
-
-
 def get_logging_level(args: Namespace) -> int:
     if args.quiet:
         return logging.CRITICAL
@@ -84,7 +71,7 @@ def get_logging_level(args: Namespace) -> int:
         return logging.INFO
 
 
-def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
+def determine_flavors(studio: bool, local: bool, all: bool, token: str | None):
     if studio and not token:
         raise DataChainError(
             "Not logged in to Studio. Log in with 'datachain auth login'."
datachain/client/azure.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any
 from urllib.parse import parse_qs, urlsplit, urlunsplit
 
 from adlfs import AzureBlobFileSystem
@@ -15,7 +15,7 @@ class AzureClient(Client):
     protocol = "az"
 
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
-        version_id = v.get("version_id")
+        version_id = v.get("version_id") if self._is_version_aware() else None
         return File(
             source=self.uri,
             path=path,
@@ -65,7 +65,7 @@ class AzureClient(Client):
                 if entries:
                     await result_queue.put(entries)
                     pbar.update(len(entries))
-                if not found:
+                if not found and prefix:
                     raise FileNotFoundError(
                         f"Unable to resolve remote path: {prefix}"
                     )
@@ -73,7 +73,7 @@ class AzureClient(Client):
         result_queue.put_nowait(None)
 
     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         parts = list(urlsplit(path))
         query = parse_qs(parts[3])
         if "versionid" in query:
datachain/client/fsspec.py CHANGED
@@ -10,15 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator, Iterator, Sequence
 from datetime import datetime
 from shutil import copy2
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    BinaryIO,
-    ClassVar,
-    NamedTuple,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, NamedTuple
 from urllib.parse import urlparse
 
 from dvc_objects.fs.system import reflink
@@ -44,11 +36,12 @@ FETCH_WORKERS = 100
 DELIMITER = "/"  # Path delimiter.
 
 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
+CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}
 
-ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
+ResultQueue = asyncio.Queue[Sequence["File"] | None]
 
 
-def _is_win_local_path(uri: str) -> bool:
+def is_win_local_path(uri: str) -> bool:
     if sys.platform == "win32":
         if len(uri) >= 1 and uri[0] == "\\":
             return True
@@ -62,10 +55,20 @@ def _is_win_local_path(uri: str) -> bool:
     return False
 
 
+def is_cloud_uri(uri: str) -> bool:
+    protocol = urlparse(uri).scheme
+    return protocol in CLOUD_STORAGE_PROTOCOLS
+
+
+def get_cloud_schemes() -> list[str]:
+    """Get list of cloud storage scheme prefixes."""
+    return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]
+
+
 class Bucket(NamedTuple):
     name: str
     uri: "StorageURI"
-    created: Optional[datetime]
+    created: datetime | None
 
 
 class Client(ABC):
@@ -77,21 +80,22 @@ class Client(ABC):
     def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
         self.name = name
         self.fs_kwargs = fs_kwargs
-        self._fs: Optional[AbstractFileSystem] = None
+        self._fs: AbstractFileSystem | None = None
         self.cache = cache
         self.uri = self.get_uri(self.name)
 
     @staticmethod
-    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
+    def get_implementation(url: str | os.PathLike[str]) -> type["Client"]:  # noqa: PLR0911
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
+        from .http import HTTPClient, HTTPSClient
         from .local import FileClient
         from .s3 import ClientS3
 
         protocol = urlparse(os.fspath(url)).scheme
 
-        if not protocol or _is_win_local_path(os.fspath(url)):
+        if not protocol or is_win_local_path(os.fspath(url)):
             return FileClient
         if protocol == ClientS3.protocol:
             return ClientS3
@@ -103,9 +107,18 @@ class Client(ABC):
             return FileClient
         if protocol == HfClient.protocol:
             return HfClient
+        if protocol == HTTPClient.protocol:
+            return HTTPClient
+        if protocol == HTTPSClient.protocol:
+            return HTTPSClient
 
         raise NotImplementedError(f"Unsupported protocol: {protocol}")
 
+    @classmethod
+    def path_to_uri(cls, path: str) -> str:
+        """Convert a path-like object to a URI. Default: identity."""
+        return path
+
     @staticmethod
     def is_data_source_uri(name: str) -> bool:
         # Returns True if name is one of supported data sources URIs, e.g s3 bucket
@@ -118,9 +131,7 @@ class Client(ABC):
         return cls.get_uri(storage_name), rel_path
 
     @staticmethod
-    def get_client(
-        source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
-    ) -> "Client":
+    def get_client(source: str | os.PathLike[str], cache: Cache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(os.fspath(source))
         if os.name == "nt":
@@ -136,7 +147,7 @@ class Client(ABC):
         return fs
 
     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         return path
 
     @classmethod
@@ -207,24 +218,25 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
+        file_path = file.get_path_normalized()
         kwargs = {}
-        if getattr(self.fs, "version_aware", False):
+        if self._is_version_aware():
             kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(file.path, file.version), **kwargs
+            self.get_full_path(file_path, file.version), **kwargs
        )
-        return self.info_to_file(info, file.path).etag
+        return self.info_to_file(info, file_path).etag
 
-    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
         return self.info_to_file(info, path)
 
-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return await self.fs._size(
             self.version_path(path, version_id), version_id=version_id
         )
 
-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return await self.fs._get_file(
             self.version_path(lpath, version_id),
             rpath,
@@ -326,15 +338,19 @@ class Client(ABC):
         """
         return not (key.startswith("/") or key.endswith("/") or "//" in key)
 
+    def _is_version_aware(self) -> bool:
+        return getattr(self.fs, "version_aware", False)
+
     async def ls_dir(self, path):
-        if getattr(self.fs, "version_aware", False):
+        kwargs = {}
+        if self._is_version_aware():
             kwargs = {"versions": True}
         return await self.fs._ls(path, detail=True, **kwargs)
 
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]
 
-    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
         return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)
 
     @abstractmethod
@@ -382,7 +398,8 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.path, file.version)), cb
+            self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
+            cb,
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
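
Usage sketch for the new module-level helpers in this file; the definitions are copied from the hunk above, the calls are illustrative:

    from urllib.parse import urlparse

    CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}

    def is_cloud_uri(uri: str) -> bool:
        protocol = urlparse(uri).scheme
        return protocol in CLOUD_STORAGE_PROTOCOLS

    def get_cloud_schemes() -> list[str]:
        """Get list of cloud storage scheme prefixes."""
        return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]

    print(is_cloud_uri("s3://bucket/key"))  # True
    print(is_cloud_uri("/local/path"))      # False
    print(sorted(get_cloud_schemes()))      # ['az://', 'gs://', 'hf://', 's3://']
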