datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/storage.py
CHANGED
@@ -1,20 +1,17 @@
-import os
-from …
-…
-…
-    Union,
-)
+import os
+from collections.abc import Sequence
+from functools import reduce
+from typing import TYPE_CHECKING
 
-from datachain.lib.…
-…
-…
-…
-…
-…
-    get_listing,
-    list_bucket,
-    ls,
+from datachain.lib.dc.storage_pattern import (
+    apply_glob_filter,
+    expand_brace_pattern,
+    should_use_recursion,
+    split_uri_pattern,
+    validate_cloud_bucket_name,
 )
+from datachain.lib.file import FileType, get_file_type
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.query import Session
 
 if TYPE_CHECKING:
@@ -22,31 +19,68 @@ if TYPE_CHECKING:
 
 
 def read_storage(
-    uri: …
+    uri: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     *,
     type: FileType = "binary",
-    session: …
-    settings: …
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    recursive: …
-    …
+    recursive: bool | None = True,
+    column: str = "file",
     update: bool = False,
-    anon: bool = …
-    …
+    anon: bool | None = None,
+    delta: bool | None = False,
+    delta_on: str | Sequence[str] | None = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
+    delta_result_on: str | Sequence[str] | None = None,
+    delta_compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
+    delta_unsafe: bool = False,
+    client_config: dict | None = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of files with all file attributes.
     It returns the chain itself as usual.
 
     Parameters:
-        uri…
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        uri: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+            - `*` : wildcard
+            - `**` : recursive wildcard
+            - `?` : single character
+            - `{a,b}` : brace expansion list
+            - `{1..9}` : brace numeric or alphabetic range
+        type: read file as "binary", "text", or "image" data. Default is "binary".
+        recursive: search recursively for the given path.
+        column: Column name that will contain File objects. Default is "file".
+        update: force storage reindexing. Default is False.
+        anon: If True, the cloud bucket is treated as a public one.
+        client_config: Optional client configuration for the storage client.
+        delta: If True, only process new or changed files instead of reprocessing
+            everything. This saves time by skipping files that were already
+            processed in previous versions. The optimization applies when a new
+            version of the dataset is created. Default is False.
+        delta_on: Field(s) that uniquely identify each record in the source data.
+            Used to detect which records are new or changed.
+            Default is ("file.path", "file.etag", "file.version").
+        delta_result_on: Field(s) in the result dataset that match `delta_on`
+            fields. Only needed if you rename the identifying fields during
+            processing. Default is None.
+        delta_compare: Field(s) used to detect if a record has changed.
+            If not specified, all fields except `delta_on` fields are used.
+            Default is None.
+        delta_retry: Controls retry behavior for failed records:
+            - String (field name): Reprocess records where this field is not
+              empty (error mode)
+            - True: Reprocess records missing from the result dataset
+              (missing mode)
+            - None: No retry processing (default)
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct. Caller must ensure datasets are consistent and not
+            partially updated.
 
     Returns:
         DataChain: A DataChain object containing the file information.
@@ -55,37 +89,36 @@ def read_storage(
         Simple call from s3:
        ```python
        import datachain as dc
-        …
+        dc.read_storage("s3://my-bucket/my-dir")
+        ```
+
+        Match all .json files recursively using a glob pattern:
+        ```py
+        dc.read_storage("gs://bucket/meta/**/*.json")
+        ```
+
+        Match image file extensions for directories with a pattern:
+        ```py
+        dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
+        ```
+
+        By ranges in filenames:
+        ```py
+        dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
        ```
 
        Multiple URIs:
        ```python
-        …
-            "s3://bucket1/dir1",
-            "s3://bucket2/dir2"
-        ])
+        dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
        ```
 
        With AWS S3-compatible storage:
        ```python
-        …
+        dc.read_storage(
            "s3://my-bucket/my-dir",
            client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
        )
        ```
-
-        Pass existing session:
-        ```py
-        session = Session.get()
-        chain = dc.read_storage([
-            "path/to/dir1",
-            "path/to/dir2"
-        ], session=session, recursive=True)
-        ```
-
-    Note:
-        When using multiple URIs with `update=True`, the function optimizes by
-        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
     from .datasets import read_dataset
@@ -94,24 +127,50 @@ def read_storage(
 
     file_type = get_file_type(type)
 
-    if anon:
-        client_config = (client_config or {}) | {"anon": …
+    if anon is not None:
+        client_config = (client_config or {}) | {"anon": anon}
     session = Session.get(session, client_config=client_config, in_memory=in_memory)
-    …
+    catalog = session.catalog
+    cache = catalog.cache
     client_config = session.catalog.client_config
+    listing_namespace_name = catalog.metastore.system_namespace_name
+    listing_project_name = catalog.metastore.listing_project_name
 
     uris = uri if isinstance(uri, (list, tuple)) else [uri]
 
     if not uris:
         raise ValueError("No URIs provided")
 
-    …
+    # Then expand all URIs that contain brace patterns
+    expanded_uris = []
+    for single_uri in uris:
+        uri_str = str(single_uri)
+        validate_cloud_bucket_name(uri_str)
+        expanded_uris.extend(expand_brace_pattern(uri_str))
+
+    # Now process each expanded URI
+    chains = []
     listed_ds_name = set()
     file_values = []
 
-    …
+    updated_uris = set()
+
+    for single_uri in expanded_uris:
+        # Check if URI contains glob patterns and split them
+        base_uri, glob_pattern = split_uri_pattern(single_uri)
+
+        # If a pattern is found, use the base_uri for listing
+        # The pattern will be used for filtering later
+        list_uri_to_use = base_uri if glob_pattern else single_uri
+
+        # Avoid double updates for the same URI
+        update_single_uri = False
+        if update and (list_uri_to_use not in updated_uris):
+            updated_uris.add(list_uri_to_use)
+            update_single_uri = True
+
         list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-            …
+            list_uri_to_use, session, update=update_single_uri
         )
 
         # list_ds_name is None if object is a file, we don't want to use cache
@@ -122,9 +181,21 @@ def read_storage(
             )
             continue
 
-        dc = read_dataset(…
+        dc = read_dataset(
+            list_ds_name,
+            namespace=listing_namespace_name,
+            project=listing_project_name,
+            session=session,
+            settings=settings,
+            delta=delta,
+            delta_on=delta_on,
+            delta_result_on=delta_result_on,
+            delta_compare=delta_compare,
+            delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
+        )
         dc._query.update = update
-        dc.signals_schema = dc.signals_schema.mutate({f"{…
+        dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
 
         if update or not list_ds_exists:
 
@@ -137,23 +208,42 @@ def read_storage(
                     settings=settings,
                     in_memory=in_memory,
                 )
-                .settings(…
+                .settings(
+                    prefetch=0,
+                    namespace=listing_namespace_name,
+                    project=listing_project_name,
+                )
                 .gen(
                     list_bucket(lst_uri, cache, client_config=client_config),
-                    output={f"{…
+                    output={f"{column}": file_type},
                 )
-                …
+                # for internal listing datasets, we always bump major version
+                .save(ds_name, listing=True, update_version="major")
             )
 
         dc._query.set_listing_fn(
             lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
        )
 
-        …
+        # If a glob pattern was detected, use it for filtering
+        # Otherwise, use the original list_path from get_listing
+        if glob_pattern:
+            # Determine if we should use recursive listing based on the pattern
+            use_recursive = should_use_recursion(glob_pattern, recursive or False)
+
+            # Apply glob filter - no need for brace expansion here as it's done above
+            chain = apply_glob_filter(
+                dc, glob_pattern, list_path, use_recursive, column
+            )
+            chains.append(chain)
+        else:
+            # No glob pattern detected, use normal ls behavior
+            chains.append(ls(dc, list_path, recursive=recursive, column=column))
 
-        storage_chain = storage_chain.union(chain) if storage_chain else chain
         listed_ds_name.add(list_ds_name)
 
+    storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
+
     if file_values:
         file_chain = read_values(
             session=session,
@@ -162,7 +252,7 @@ def read_storage(
             file=file_values,
         )
         file_chain.signals_schema = file_chain.signals_schema.mutate(
-            {f"{…
+            {f"{column}": file_type}
        )
        storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
 
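The rewritten `read_storage` above chains three new behaviors: brace patterns are expanded into concrete URIs up front (`expand_brace_pattern`), each URI is split into a listing base plus a glob filter (`split_uri_pattern`), and the `delta_*` arguments are forwarded to `read_dataset` so only new or changed files get reprocessed. A minimal sketch of how these combine; the bucket layout and the error-field name are illustrative, not from this diff:

```python
import datachain as dc

chain = dc.read_storage(
    # {01..12} expands to twelve URIs before listing; the glob tail
    # **/*.{jpg,png} is applied as a filter after listing.
    "s3://my-bucket/batch-{01..12}/**/*.{jpg,png}",  # hypothetical layout
    type="image",
    delta=True,                           # skip already-processed files
    delta_on=("file.path", "file.etag"),  # identity of a source record
    delta_retry="error",                  # hypothetical error-field name
)
```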
datachain/lib/dc/storage_pattern.py
ADDED
@@ -0,0 +1,251 @@
+import glob
+from typing import TYPE_CHECKING
+
+from datachain.client.fsspec import is_cloud_uri
+from datachain.lib.listing import ls
+
+if TYPE_CHECKING:
+    from .datachain import DataChain
+
+
+def validate_cloud_bucket_name(uri: str) -> None:
+    """
+    Validate that cloud storage bucket names don't contain glob patterns.
+
+    Raises:
+        ValueError: If a cloud storage bucket name contains glob patterns
+    """
+    if not is_cloud_uri(uri):
+        return
+
+    if "://" in uri:
+        scheme_end = uri.index("://") + 3
+        path_part = uri[scheme_end:]
+
+        if "/" in path_part:
+            bucket_name = path_part.split("/")[0]
+        else:
+            bucket_name = path_part
+
+        glob_chars = ["*", "?", "[", "]", "{", "}"]
+        if any(char in bucket_name for char in glob_chars):
+            raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
+
+
+def split_uri_pattern(uri: str) -> tuple[str, str | None]:
+    """Split a URI into base path and glob pattern."""
+    if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
+        return uri, None
+
+    if "://" in uri:
+        scheme_end = uri.index("://") + 3
+        scheme_part = uri[:scheme_end]
+        path_part = uri[scheme_end:]
+        path_segments = path_part.split("/")
+
+        pattern_start_idx = None
+        for i, segment in enumerate(path_segments):
+            # Check for glob patterns including brace expansion
+            if glob.has_magic(segment) or "{" in segment:
+                pattern_start_idx = i
+                break
+
+        if pattern_start_idx is None:
+            return uri, None
+
+        if pattern_start_idx == 0:
+            base = scheme_part + path_segments[0]
+            pattern = "/".join(path_segments[1:]) if len(path_segments) > 1 else "*"
+        else:
+            base = scheme_part + "/".join(path_segments[:pattern_start_idx])
+            pattern = "/".join(path_segments[pattern_start_idx:])
+
+        return base, pattern
+
+    path_segments = uri.split("/")
+
+    pattern_start_idx = None
+    for i, segment in enumerate(path_segments):
+        if glob.has_magic(segment) or "{" in segment:
+            pattern_start_idx = i
+            break
+
+    if pattern_start_idx is None:
+        return uri, None
+
+    base = "/".join(path_segments[:pattern_start_idx]) if pattern_start_idx > 0 else "/"
+    pattern = "/".join(path_segments[pattern_start_idx:])
+
+    return base, pattern
+
+
+def should_use_recursion(pattern: str, user_recursive: bool) -> bool:
+    if not user_recursive:
+        return False
+
+    if "**" in pattern:
+        return True
+
+    return "/" in pattern
+
+
+def expand_brace_pattern(pattern: str) -> list[str]:
+    """
+    Recursively expand brace patterns into multiple glob patterns.
+    Supports:
+    - Comma-separated lists: *.{mp3,wav}
+    - Numeric ranges: file{1..10}
+    - Zero-padded numeric ranges: file{01..10}
+    - Character ranges: file{a..z}
+
+    Examples:
+        "*.{mp3,wav}" -> ["*.mp3", "*.wav"]
+        "file{1..3}" -> ["file1", "file2", "file3"]
+        "file{01..03}" -> ["file01", "file02", "file03"]
+        "file{a..c}" -> ["filea", "fileb", "filec"]
+        "{a,b}/{c,d}" -> ["a/c", "a/d", "b/c", "b/d"]
+    """
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    return _expand_single_braces(pattern)
+
+
+def _expand_single_braces(pattern: str) -> list[str]:
+    if "{" not in pattern or "}" not in pattern:
+        return [pattern]
+
+    start = pattern.index("{")
+    end = start
+    depth = 0
+    for i in range(start, len(pattern)):
+        if pattern[i] == "{":
+            depth += 1
+        elif pattern[i] == "}":
+            depth -= 1
+            if depth == 0:
+                end = i
+                break
+
+    if start >= end:
+        return [pattern]
+
+    prefix = pattern[:start]
+    suffix = pattern[end + 1 :]
+    brace_content = pattern[start + 1 : end]
+
+    if ".." in brace_content:
+        options = _expand_range(brace_content)
+    else:
+        options = [opt.strip() for opt in brace_content.split(",")]
+
+    expanded = []
+    for option in options:
+        combined = prefix + option + suffix
+        expanded.extend(_expand_single_braces(combined))
+
+    return expanded
+
+
+def _expand_range(range_spec: str) -> list[str]:  # noqa: PLR0911
+    if ".." not in range_spec:
+        return [range_spec]
+
+    parts = range_spec.split("..")
+    if len(parts) != 2:
+        return [range_spec]
+
+    start, end = parts[0], parts[1]
+
+    if start.isdigit() and end.isdigit():
+        pad_width = max(len(start), len(end)) if start[0] == "0" or end[0] == "0" else 0
+        start_num = int(start)
+        end_num = int(end)
+
+        if start_num <= end_num:
+            if pad_width > 0:
+                return [str(i).zfill(pad_width) for i in range(start_num, end_num + 1)]
+            return [str(i) for i in range(start_num, end_num + 1)]
+        if pad_width > 0:
+            return [str(i).zfill(pad_width) for i in range(start_num, end_num - 1, -1)]
+        return [str(i) for i in range(start_num, end_num - 1, -1)]
+
+    if len(start) == 1 and len(end) == 1 and start.isalpha() and end.isalpha():
+        start_ord = ord(start)
+        end_ord = ord(end)
+
+        if start_ord <= end_ord:
+            return [chr(i) for i in range(start_ord, end_ord + 1)]
+        return [chr(i) for i in range(start_ord, end_ord - 1, -1)]
+
+    return [range_spec]
+
+
+def convert_globstar_to_glob(filter_pattern: str) -> str:
+    if "**" not in filter_pattern:
+        return filter_pattern
+
+    parts = filter_pattern.split("/")
+    globstar_positions = [i for i, p in enumerate(parts) if p == "**"]
+
+    num_globstars = len(globstar_positions)
+
+    if num_globstars <= 1:
+        if filter_pattern == "**/*":
+            return "*"
+        if filter_pattern.startswith("**/"):
+            remaining = filter_pattern[3:]
+            if "/" not in remaining:
+                # Pattern like **/*.ext or **/temp?.*
+                # The ** means zero or more directories
+                # For zero directories: pattern should be just the filename pattern
+                # For one or more: pattern should be */filename
+                # Since we can't OR in GLOB, we choose the more permissive option
+                # that works with recursive listing
+                # Special handling: if it's a simple extension pattern, match broadly
+                if remaining.startswith("*."):
+                    return remaining
+                return f"*/{remaining}"
+
+        return filter_pattern.replace("**", "*")
+
+    middle_parts = []
+    start_idx = globstar_positions[0] + 1
+    end_idx = globstar_positions[-1]
+    for i in range(start_idx, end_idx):
+        if parts[i] != "**":
+            middle_parts.append(parts[i])
+
+    if not middle_parts:
+        result = filter_pattern.replace("**", "*")
+    else:
+        middle_pattern = "/".join(middle_parts)
+        last_part = parts[-1] if parts[-1] != "**" else "*"
+
+        if last_part != "*":
+            result = f"*{middle_pattern}*{last_part}"
+        else:
+            result = f"*{middle_pattern}*"
+
+    return result
+
+
+def apply_glob_filter(
+    dc: "DataChain",
+    pattern: str,
+    list_path: str,
+    use_recursive: bool,
+    column: str,
+) -> "DataChain":
+    from datachain.query.schema import Column
+
+    chain = ls(dc, list_path, recursive=use_recursive, column=column)
+
+    if list_path and "/" not in pattern:
+        filter_pattern = f"{list_path.rstrip('/')}/{pattern}"
+    else:
+        filter_pattern = pattern
+
+    glob_pattern = convert_globstar_to_glob(filter_pattern)
+
+    return chain.filter(Column(f"{column}.path").glob(glob_pattern))
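The helpers above do the pattern work for `read_storage`: `expand_brace_pattern` runs first, `split_uri_pattern` separates the listable base from the glob tail, and `convert_globstar_to_glob` approximates `**` for the SQL GLOB filter. A short behavioral sketch; the expected values follow the docstrings and code above:

```python
from datachain.lib.dc.storage_pattern import (
    convert_globstar_to_glob,
    expand_brace_pattern,
    split_uri_pattern,
)

# Brace expansion is recursive and preserves zero padding in ranges.
assert expand_brace_pattern("*.{mp3,wav}") == ["*.mp3", "*.wav"]
assert expand_brace_pattern("file{01..03}") == ["file01", "file02", "file03"]

# Each expanded URI is split at the first path segment with glob magic.
base, pattern = split_uri_pattern("s3://bucket/meta/**/*.json")
assert (base, pattern) == ("s3://bucket/meta", "**/*.json")

# SQL GLOB has no `**`; a leading globstar over a simple extension
# pattern collapses to the bare extension match.
assert convert_globstar_to_glob("**/*.json") == "*.json"
```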
datachain/lib/dc/utils.py
CHANGED
@@ -1,12 +1,6 @@
 from collections.abc import Sequence
 from functools import wraps
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import TYPE_CHECKING, TypeVar
 
 import sqlalchemy
 from sqlalchemy.sql.functions import GenericFunction
@@ -15,9 +9,13 @@ from datachain.func.base import Function
 from datachain.lib.data_model import DataModel, DataType
 from datachain.lib.utils import DataChainParamsError
 from datachain.query.schema import DEFAULT_DELIMITER
+from datachain.utils import getenv_bool
 
 if TYPE_CHECKING:
-    from …
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec
 
     from .datachain import DataChain
 
@@ -26,13 +24,23 @@ if TYPE_CHECKING:
 D = TypeVar("D", bound="DataChain")
 
 
+def is_studio() -> bool:
+    """Check if the runtime environment is Studio (not local)."""
+    return getenv_bool("DATACHAIN_IS_STUDIO", default=False)
+
+
+def is_local() -> bool:
+    """Check if the runtime environment is local (not Studio)."""
+    return not is_studio()
+
+
 def resolve_columns(
     method: "Callable[Concatenate[D, P], D]",
 ) -> "Callable[Concatenate[D, P], D]":
     """Decorator that resolves input column names to their actual DB names. This is
     especially important for nested columns, as the user works with them using dot
-    notation e.g (file.…
-    in DB, e.g …
+    notation, e.g. (file.path), but they are actually defined with the default
+    delimiter in the DB, e.g. file__path.
     If there are any sql functions in arguments, they will just be transferred as is
     to a method.
     """
@@ -65,11 +73,11 @@ class DatasetFromValuesError(DataChainParamsError):
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
-MergeColType = …
+MergeColType = str | Function | sqlalchemy.ColumnElement
 
 
 def _validate_merge_on(
-    on: …
+    on: MergeColType | Sequence[MergeColType],
     ds: "DataChain",
 ) -> Sequence[MergeColType]:
     if isinstance(on, (str, sqlalchemy.ColumnElement)):
@@ -98,12 +106,12 @@ def _get_merge_error_str(col: MergeColType) -> str:
 class DatasetMergeError(DataChainParamsError):
     def __init__(
         self,
-        on: …
-        right_on: …
+        on: MergeColType | Sequence[MergeColType],
+        right_on: MergeColType | Sequence[MergeColType] | None,
         msg: str,
     ):
         def _get_str(
-            on: …
+            on: MergeColType | Sequence[MergeColType],
         ) -> str:
             if not isinstance(on, Sequence):
                 return str(on)  # type: ignore[unreachable]
@@ -118,7 +126,7 @@ class DatasetMergeError(DataChainParamsError):
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
 
 
-OutputType = …
+OutputType = DataType | Sequence[str] | dict[str, DataType] | None
 
 
 class Sys(DataModel):