datachain 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -17,7 +17,12 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     )
 
     studio_run_help = "Run a job in Studio"
-    studio_run_description = "Run a job in Studio."
+    studio_run_description = "Run a job in Studio. \n"
+    studio_run_description += (
+        "When using --start-time or --cron,"
+        " the job is scheduled to run but won't start immediately"
+        " (can be seen in the Tasks tab in UI)"
+    )
 
     studio_run_parser = jobs_subparser.add_parser(
         "run",
@@ -96,6 +101,14 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         help="Priority for the job in range 0-5. "
         "Lower value is higher priority (default: 5)",
     )
+    studio_run_parser.add_argument(
+        "--start-time",
+        action="store",
+        help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
+    )
+    studio_run_parser.add_argument(
+        "--cron", action="store", help="Cron expression for the cron task."
+    )
 
     studio_ls_help = "List jobs in Studio"
     studio_ls_description = "List jobs in Studio."
datachain/lib/arrow.py CHANGED
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
             if field.nullable and not ModelStore.is_pydantic(dtype):
                 dtype = Optional[dtype]  # type: ignore[assignment]
             type_dict[field.name] = dtype
-        return dict_to_data_model(column, type_dict)
+        return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
     if pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
datachain/lib/audio.py CHANGED
@@ -33,10 +33,14 @@ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
         frames = int(info.num_frames)
         duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
 
-        # Get format information
-        format_name = getattr(info, "format", "")
         codec_name = getattr(info, "encoding", "")
-        bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
+        file_ext = file.get_file_ext().lower()
+        format_name = _encoding_to_format(codec_name, file_ext)
+
+        bits_per_sample = getattr(info, "bits_per_sample", 0)
+        bit_rate = (
+            bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
+        )
 
     except Exception as exc:
         raise FileError(
@@ -54,7 +58,47 @@ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
     )
 
 
-def audio_fragment_np(
+def _encoding_to_format(encoding: str, file_ext: str) -> str:
+    """
+    Map torchaudio encoding to a format name.
+
+    Args:
+        encoding: The encoding string from torchaudio.info()
+        file_ext: The file extension as a fallback
+
+    Returns:
+        Format name as a string
+    """
+    # Direct mapping for formats that match exactly
+    encoding_map = {
+        "FLAC": "flac",
+        "MP3": "mp3",
+        "VORBIS": "ogg",
+        "AMR_WB": "amr",
+        "AMR_NB": "amr",
+        "OPUS": "opus",
+        "GSM": "gsm",
+    }
+
+    if encoding in encoding_map:
+        return encoding_map[encoding]
+
+    # For PCM variants, use file extension to determine format
+    if encoding.startswith("PCM_"):
+        # Common PCM formats by extension
+        pcm_formats = {
+            "wav": "wav",
+            "aiff": "aiff",
+            "au": "au",
+            "raw": "raw",
+        }
+        return pcm_formats.get(file_ext, "wav")  # Default to wav for PCM
+
+    # Fallback to file extension if encoding is unknown
+    return file_ext if file_ext else "unknown"
+
+
+def audio_to_np(
     audio: "AudioFile", start: float = 0, duration: Optional[float] = None
 ) -> "tuple[ndarray, int]":
     """Load audio fragment as numpy array.
@@ -98,14 +142,17 @@ def audio_fragment_np(
         ) from exc
 
 
-def audio_fragment_bytes(
+def audio_to_bytes(
     audio: "AudioFile",
+    format: str = "wav",
     start: float = 0,
     duration: Optional[float] = None,
-    format: str = "wav",
 ) -> bytes:
-    """Convert audio fragment to bytes using soundfile."""
-    y, sr = audio_fragment_np(audio, start, duration)
+    """Convert audio to bytes using soundfile.
+
+    If duration is None, converts from start to end of file.
+    If start is 0 and duration is None, converts entire file."""
+    y, sr = audio_to_np(audio, start, duration)
 
     import io
 
@@ -116,36 +163,82 @@ def audio_fragment_bytes(
     return buffer.getvalue()
 
 
-def save_audio_fragment(
+def save_audio(
     audio: "AudioFile",
-    start: float,
-    end: float,
     output: str,
     format: Optional[str] = None,
+    start: float = 0,
+    end: Optional[float] = None,
 ) -> "AudioFile":
-    """Save audio fragment with timestamped filename.
-    Supports local and remote storage upload."""
-    if start < 0 or end < 0 or start >= end:
-        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
-
+    """Save audio file or extract fragment to specified format.
+
+    Args:
+        audio: Source AudioFile object
+        output: Output directory path
+        format: Output format ('wav', 'mp3', etc). Defaults to source format
+        start: Start time in seconds (>= 0). Defaults to 0
+        end: End time in seconds. If None, extracts to end of file
+
+    Returns:
+        AudioFile: New audio file with format conversion/extraction applied
+
+    Examples:
+        save_audio(audio, "/path", "mp3")  # Entire file to MP3
+        save_audio(audio, "s3://bucket/path", "wav", start=2.5)  # From 2.5s to end
+        save_audio(audio, "/path", "flac", start=1, end=3)  # Extract 1-3s fragment
+    """
     if format is None:
         format = audio.get_file_ext()
 
-    duration = end - start
-    start_ms = int(start * 1000)
-    end_ms = int(end * 1000)
-    output_file = posixpath.join(
-        output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
-    )
+    # Validate start time
+    if start < 0:
+        raise ValueError(
+            f"Can't save audio for '{audio.path}', "
+            f"start time must be non-negative: {start:.3f}"
+        )
+
+    # Handle full file conversion when end is None and start is 0
+    if end is None and start == 0:
+        output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
+        try:
+            audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
+        except Exception as exc:
+            raise FileError(
+                "unable to convert audio file", audio.source, audio.path
+            ) from exc
+    elif end is None:
+        # Extract from start to end of file
+        output_file = posixpath.join(
+            output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
+        )
+        try:
+            audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
+        except Exception as exc:
+            raise FileError(
+                "unable to save audio fragment", audio.source, audio.path
+            ) from exc
+    else:
+        # Fragment extraction mode with specific end time
+        if end < 0 or start >= end:
+            raise ValueError(
+                f"Can't save audio for '{audio.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
 
-    try:
-        audio_bytes = audio_fragment_bytes(audio, start, duration, format)
+        duration = end - start
+        start_ms = int(start * 1000)
+        end_ms = int(end * 1000)
+        output_file = posixpath.join(
+            output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+        )
 
-        from datachain.lib.file import AudioFile
+        try:
+            audio_bytes = audio_to_bytes(audio, format, start, duration)
+        except Exception as exc:
+            raise FileError(
+                "unable to save audio fragment", audio.source, audio.path
+            ) from exc
 
-        return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
+    from datachain.lib.file import AudioFile
 
-    except Exception as exc:
-        raise FileError(
-            "unable to save audio fragment", audio.source, audio.path
-        ) from exc
+    return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
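Note that `audio_to_bytes` is not a pure rename of `audio_fragment_bytes`: the `format` parameter moved from last position to second, ahead of `start` and `duration`, so positional callers need updating. A minimal migration sketch, assuming an existing `AudioFile` instance named `audio`:

```py
from datachain.lib.audio import audio_to_bytes

# 0.26.3 call shape: audio_fragment_bytes(audio, start, duration, format)
# 0.27.0 moves format ahead of start/duration:
fragment_mp3 = audio_to_bytes(audio, "mp3", start=1.0, duration=2.0)

# With the new defaults (start=0, duration=None), a bare call
# converts the entire file:
whole_wav = audio_to_bytes(audio, "wav")
```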
@@ -1,3 +1,5 @@
+import inspect
+import uuid
 from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Optional, Union, get_args, get_origin
@@ -80,7 +82,9 @@ def dict_to_data_model(
 
     fields = {
         name: (
-            anno,
+            anno
+            if inspect.isclass(anno) and issubclass(anno, BaseModel)
+            else Optional[anno],
             Field(
                 validation_alias=AliasChoices(name, original_names[idx] or name),
                 default=None,
@@ -101,6 +105,10 @@ def dict_to_data_model(
             field_info[str(alias)] = (_name, field)
         return field_info
 
+    # Generate random unique name if not provided
+    if not name:
+        name = f"DataModel_{uuid.uuid4().hex[:8]}"
+
     return create_model(
         name,
         __base__=_DataModelStrict,
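With the `uuid` fallback, callers may now pass an empty name and still get a uniquely named model class, and non-`BaseModel` annotations are wrapped in `Optional[...]` with a `None` default. A minimal sketch of the resulting behavior, assuming `dict_to_data_model` keeps its `(name, types)` signature:

```py
from datachain.lib.data_model import dict_to_data_model

# An empty name no longer matters: each call mints a fresh
# "DataModel_<8 hex chars>" class name, avoiding collisions.
Model = dict_to_data_model("", {"width": int, "height": int})
print(Model.__name__)  # e.g. "DataModel_1a2b3c4d"

# Plain annotations become Optional with default=None, so partially
# filled records validate instead of raising.
row = Model(width=640)
print(row.height)  # None
```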
@@ -2388,7 +2388,7 @@ class DataChain:
         placement: FileExportPlacement = "fullpath",
         link_type: Literal["copy", "symlink"] = "copy",
         num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
-        anon: bool = False,
+        anon: Optional[bool] = None,
         client_config: Optional[dict] = None,
     ) -> None:
         """Export files from a specified signal to a directory. Files can be
@@ -2403,7 +2403,11 @@ class DataChain:
                 Falls back to `'copy'` if symlinking fails.
             num_threads : number of threads to use for exporting files.
                 By default it uses 5 threads.
-            anon: If true, we will treat cloud bucket as public one
+            anon: If True, we will treat cloud bucket as public one. Default behavior
+                depends on the previous session configuration (e.g. happens in the
+                initial `read_storage`) and particular cloud storage client
+                implementation (e.g. S3 fallbacks to anonymous access if no credentials
+                were found).
             client_config: Optional configuration for the destination storage client
 
         Example:
@@ -2421,8 +2425,8 @@ class DataChain:
         ):
             raise ValueError("Files with the same name found")
 
-        if anon:
-            client_config = (client_config or {}) | {"anon": True}
+        if anon is not None:
+            client_config = (client_config or {}) | {"anon": anon}
 
         progress_bar = tqdm(
             desc=f"Exporting files to {output}: ",
datachain/lib/dc/hf.py CHANGED
@@ -25,19 +25,23 @@ def read_hf(
     settings: Optional[dict] = None,
     column: str = "",
     model_name: str = "",
+    limit: int = 0,
     **kwargs,
 ) -> "DataChain":
-    """Generate chain from huggingface hub dataset.
+    """Generate chain from Hugging Face Hub dataset.
 
     Parameters:
         dataset : Path or name of the dataset to read from Hugging Face Hub,
            or an instance of `datasets.Dataset`-like object.
-        args : Additional positional arguments to pass to datasets.load_dataset.
+        args : Additional positional arguments to pass to `datasets.load_dataset`.
         session : Session to use for the chain.
         settings : Settings to use for the chain.
         column : Generated object column name.
         model_name : Generated model name.
-        kwargs : Parameters to pass to datasets.load_dataset.
+        limit : Limit the number of items to read from the HF dataset.
+            Adds `take(limit)` to the `datasets.load_dataset`.
+            Defaults to 0 (no limit).
+        kwargs : Parameters to pass to `datasets.load_dataset`.
 
     Example:
         Load from Hugging Face Hub:
@@ -53,6 +57,18 @@ def read_hf(
        import datachain as dc
        chain = dc.read_hf(ds)
        ```
+
+        Streaming with limit, for large datasets:
+        ```py
+        import datachain as dc
+        ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
+        ```
+
+        or use HF split syntax (not supported if streaming is enabled):
+        ```py
+        import datachain as dc
+        ds = dc.read_hf("beans", split="train[%10]")
+        ```
     """
     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
 
@@ -72,4 +88,4 @@ def read_hf(
         output = {column: model}
 
     chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
-    return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
+    return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)
@@ -33,7 +33,7 @@ def read_storage(
     recursive: Optional[bool] = True,
     column: str = "file",
     update: bool = False,
-    anon: bool = False,
+    anon: Optional[bool] = None,
     delta: Optional[bool] = False,
     delta_on: Optional[Union[str, Sequence[str]]] = (
         "file.path",
@@ -124,8 +124,8 @@ def read_storage(
 
     file_type = get_file_type(type)
 
-    if anon:
-        client_config = (client_config or {}) | {"anon": True}
+    if anon is not None:
+        client_config = (client_config or {}) | {"anon": anon}
     session = Session.get(session, client_config=client_config, in_memory=in_memory)
     catalog = session.catalog
     cache = catalog.cache
datachain/lib/file.py CHANGED
@@ -717,6 +717,23 @@ class ImageFile(File):
         destination = stringify_path(destination)
 
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
+
+        # If format is not provided, determine it from the file extension
+        if format is None:
+            from pathlib import PurePosixPath
+
+            from PIL import Image as PilImage
+
+            ext = PurePosixPath(destination).suffix.lower()
+            format = PilImage.registered_extensions().get(ext)
+
+            if not format:
+                raise FileError(
+                    f"Can't determine format for destination '{destination}'",
+                    self.source,
+                    self.path,
+                )
+
         with client.fs.open(destination, mode="wb") as f:
             self.read().save(f, format=format)
 
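The fallback relies on Pillow's extension registry, which maps lowercase suffixes to format names. A quick illustration of what `ImageFile.save` now looks up:

```py
from PIL import Image as PilImage

# registered_extensions() maps suffixes like ".png" to PIL format names;
# ImageFile.save feeds the destination's suffix into this mapping.
exts = PilImage.registered_extensions()
print(exts[".png"])      # "PNG"
print(exts[".jpg"])      # "JPEG"
print(exts.get(".xyz"))  # None -> save() raises the new FileError
```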
@@ -815,7 +832,10 @@ class VideoFile(File):
             VideoFragment: A Model representing the video fragment.
         """
         if start < 0 or end < 0 or start >= end:
-            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+            raise ValueError(
+                f"Can't get video fragment for '{self.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
 
         return VideoFragment(video=self, start=start, end=end)
 
@@ -898,7 +918,10 @@ class AudioFile(File):
             AudioFragment: A Model representing the audio fragment.
         """
         if start < 0 or end < 0 or start >= end:
-            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+            raise ValueError(
+                f"Can't get audio fragment for '{self.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
 
         return AudioFragment(audio=self, start=start, end=end)
 
@@ -941,6 +964,35 @@ class AudioFile(File):
             yield self.get_fragment(start, min(start + duration, end))
             start += duration
 
+    def save(  # type: ignore[override]
+        self,
+        output: str,
+        format: Optional[str] = None,
+        start: float = 0,
+        end: Optional[float] = None,
+        client_config: Optional[dict] = None,
+    ) -> "AudioFile":
+        """Save audio file or extract fragment to specified format.
+
+        Args:
+            output: Output directory path
+            format: Output format ('wav', 'mp3', etc). Defaults to source format
+            start: Start time in seconds (>= 0). Defaults to 0
+            end: End time in seconds. If None, extracts to end of file
+            client_config: Optional client configuration
+
+        Returns:
+            AudioFile: New audio file with format conversion/extraction applied
+
+        Examples:
+            audio.save("/path", "mp3")  # Entire file to MP3
+            audio.save("s3://bucket/path", "wav", start=2.5)  # From 2.5s to end as WAV
+            audio.save("/path", "flac", start=1, end=3)  # 1-3s fragment as FLAC
+        """
+        from .audio import save_audio
+
+        return save_audio(self, output, format, start, end)
+
 
 class AudioFragment(DataModel):
     """
@@ -968,10 +1020,10 @@ class AudioFragment(DataModel):
             tuple[ndarray, int]: A tuple containing the audio data as a NumPy array
             and the sample rate.
         """
-        from .audio import audio_fragment_np
+        from .audio import audio_to_np
 
         duration = self.end - self.start
-        return audio_fragment_np(self.audio, self.start, duration)
+        return audio_to_np(self.audio, self.start, duration)
 
     def read_bytes(self, format: str = "wav") -> bytes:
         """
@@ -984,10 +1036,10 @@ class AudioFragment(DataModel):
         Returns:
             bytes: The encoded audio fragment as bytes.
         """
-        from .audio import audio_fragment_bytes
+        from .audio import audio_to_bytes
 
         duration = self.end - self.start
-        return audio_fragment_bytes(self.audio, self.start, duration, format)
+        return audio_to_bytes(self.audio, format, self.start, duration)
 
     def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
         """
@@ -1005,9 +1057,9 @@ class AudioFragment(DataModel):
         Returns:
             AudioFile: A Model representing the saved audio file.
         """
-        from .audio import save_audio_fragment
+        from .audio import save_audio
 
-        return save_audio_fragment(self.audio, self.start, self.end, output, format)
+        return save_audio(self.audio, output, format, self.start, self.end)
 
 
 class VideoFrame(DataModel):
datachain/lib/hf.py CHANGED
@@ -69,21 +69,25 @@ class HFGenerator(Generator):
         self,
         ds: Union[str, HFDatasetType],
         output_schema: type["BaseModel"],
+        limit: int = 0,
         *args,
         **kwargs,
     ):
         """
-        Generator for chain from huggingface datasets.
+        Generator for chain from Hugging Face datasets.
 
         Parameters:
 
-        ds : Path or name of the dataset to read from Hugging Face Hub,
-            or an instance of `datasets.Dataset`-like object.
-        output_schema : Pydantic model for validation.
+            ds : Path or name of the dataset to read from Hugging Face Hub,
+                or an instance of `datasets.Dataset`-like object.
+            limit : Limit the number of items to read from the HF dataset.
+                Defaults to 0 (no limit).
+            output_schema : Pydantic model for validation.
         """
         super().__init__()
         self.ds = ds
         self.output_schema = output_schema
+        self.limit = limit
         self.args = args
         self.kwargs = kwargs
 
@@ -93,6 +97,8 @@ class HFGenerator(Generator):
     def process(self, split: str = ""):
         desc = "Parsed Hugging Face dataset"
         ds = self.ds_dict[split]
+        if self.limit > 0:
+            ds = ds.take(self.limit)
         if split:
             desc += f" split '{split}'"
         model_fields = self.output_schema._model_fields_by_aliases()  # type: ignore[attr-defined]
@@ -113,7 +119,6 @@ class HFGenerator(Generator):
 
 def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     if isinstance(ds, str):
-        kwargs["streaming"] = True
         ds = load_dataset(ds, *args, **kwargs)
     if isinstance(ds, (DatasetDict, IterableDatasetDict)):
         return ds
@@ -132,7 +137,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
             sfeat = feat[sname]
             norm_name, info = model_fields[sname]
             sanno = info.annotation
-            sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
+            if isinstance(val[sname], list):
+                sdict[norm_name] = [
+                    convert_feature(v, sfeat, sanno) for v in val[sname]
+                ]
+            else:
+                sdict[norm_name] = convert_feature(val[sname], sfeat, sanno)
         return anno(**sdict)
     if isinstance(feat, Image):
         if isinstance(val, dict):
@@ -174,7 +184,7 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType:  # noqa: PLR0911
         for sname, sval in val.items():
             dtype = _feature_to_chain_type(sname, sval)
             sequence_dict[sname] = dtype  # type: ignore[valid-type]
-        return dict_to_data_model(name, sequence_dict)  # type: ignore[arg-type]
+        return dict_to_data_model(f"HFDataModel_{name}", sequence_dict)  # type: ignore[arg-type]
     if isinstance(val, List):
         return list[_feature_to_chain_type(name, val.feature)]  # type: ignore[arg-type,misc,return-value]
     if isinstance(val, Array2D):
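Dropping the forced `kwargs["streaming"] = True` means string datasets are now downloaded in full unless the caller opts into streaming; the new `limit` keeps streamed reads bounded via `take()`. A sketch of the underlying `datasets` calls, mirroring what `HFGenerator.process` does when `limit > 0`:

```py
from datasets import load_dataset

# Streaming is now opt-in (previously stream_splits always forced it).
ds = load_dataset("beans", split="train", streaming=True)

# Equivalent to HFGenerator.process with limit=10: take() caps how many
# items are pulled from the hub.
for row in ds.take(10):
    ...  # at most 10 items
```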
datachain/lib/video.py CHANGED
@@ -205,7 +205,10 @@ def save_video_fragment(
         VideoFile: Video fragment model.
     """
     if start < 0 or end < 0 or start >= end:
-        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+        raise ValueError(
+            f"Can't save video fragment for '{video.path}', "
+            f"invalid time range: ({start:.3f}, {end:.3f})"
+        )
 
     if format is None:
         format = video.get_file_ext()
@@ -429,6 +429,8 @@ class StudioClient:
         repository: Optional[str] = None,
         priority: Optional[int] = None,
         cluster: Optional[str] = None,
+        start_time: Optional[str] = None,
+        cron: Optional[str] = None,
     ) -> Response[JobData]:
         data = {
             "query": query,
@@ -442,6 +444,8 @@ class StudioClient:
             "repository": repository,
             "priority": priority,
             "compute_cluster_name": cluster,
+            "start_after": start_time,
+            "cron_expression": cron,
         }
         return self._send_request("datachain/job", data)
 
datachain/studio.py CHANGED
@@ -1,8 +1,10 @@
 import asyncio
 import os
 import sys
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Optional
 
+import dateparser
 import tabulate
 
 from datachain.config import Config, ConfigLevel
@@ -42,6 +44,8 @@ def process_jobs_args(args: "Namespace"):
             args.req_file,
             args.priority,
             args.cluster,
+            args.start_time,
+            args.cron,
         )
 
     if args.cmd == "cancel":
@@ -262,6 +266,24 @@ def save_config(hostname, token, level=ConfigLevel.GLOBAL):
     return config.config_file()
 
 
+def parse_start_time(start_time_str: Optional[str]) -> Optional[str]:
+    if not start_time_str:
+        return None
+
+    # Parse the datetime string using dateparser
+    parsed_datetime = dateparser.parse(start_time_str)
+
+    if parsed_datetime is None:
+        raise DataChainError(
+            f"Could not parse datetime string: '{start_time_str}'. "
+            f"Supported formats include: '2024-01-15 14:30:00', 'tomorrow 3pm', "
+            f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc."
+        )
+
+    # Convert to ISO format string
+    return parsed_datetime.isoformat()
+
+
 def show_logs_from_client(client, job_id):
     # Sync usage
     async def _run():
@@ -310,6 +332,8 @@ def create_job(
     req_file: Optional[str] = None,
     priority: Optional[int] = None,
     cluster: Optional[str] = None,
+    start_time: Optional[str] = None,
+    cron: Optional[str] = None,
 ):
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:
@@ -328,6 +352,11 @@ def create_job(
     client = StudioClient(team=team_name)
     file_ids = upload_files(client, files) if files else []
 
+    # Parse start_time if provided
+    parsed_start_time = parse_start_time(start_time)
+    if cron and parsed_start_time is None:
+        parsed_start_time = datetime.now(timezone.utc).isoformat()
+
     response = client.create_job(
         query=query,
         query_type=query_type,
@@ -340,6 +369,8 @@ def create_job(
         requirements=requirements,
         priority=priority,
         cluster=cluster,
+        start_time=parsed_start_time,
+        cron=cron,
     )
     if not response.ok:
         raise DataChainError(response.message)
@@ -348,6 +379,11 @@ def create_job(
         raise DataChainError("Failed to create job")
 
     job_id = response.data.get("job", {}).get("id")
+
+    if parsed_start_time or cron:
+        print(f"Job {job_id} is scheduled as a task in Studio.")
+        return 0
+
     print(f"Job {job_id} created")
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
     print("=" * 40)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.26.3
+Version: 0.27.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -26,6 +26,7 @@ Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
 Requires-Dist: python-dateutil>=2
+Requires-Dist: dateparser>=1.0.0
 Requires-Dist: attrs>=21.3.0
 Requires-Dist: fsspec>=2024.2.0
 Requires-Dist: s3fs>=2024.2.0
@@ -100,6 +101,7 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
 Requires-Dist: mypy==1.17.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
@@ -118,7 +120,7 @@ Dynamic: license-file
 |logo| DataChain
 ================
 
-|PyPI| |Python Version| |Codecov| |Tests|
+|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
 
 .. |logo| image:: docs/assets/datachain.svg
    :height: 24
@@ -134,6 +136,9 @@ Dynamic: license-file
 .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
+.. |DeepWiki| image:: https://deepwiki.com/badge.svg
+   :target: https://deepwiki.com/iterative/datachain
+   :alt: DeepWiki
 
 DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
 data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -17,7 +17,7 @@ datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
-datachain/studio.py,sha256=bLok-eJNFRHQScEyAyA_Fas52dmijd5r-73KudWxV4k,13337
+datachain/studio.py,sha256=RCpVZdHRX-ClEddXaAsZDGFy5o-SOqVCa5NhLj8337s,14486
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
@@ -35,7 +35,7 @@ datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibV
 datachain/cli/commands/query.py,sha256=Xzfgh14nPVH-sclqX1tpZqgfdTugw5s_44v0D33z6FA,1505
 datachain/cli/commands/show.py,sha256=Cf8wBs12h-xtdOzjU5GTDy2C8rF5HJSF0hDJYER1zH8,1606
 datachain/cli/parser/__init__.py,sha256=NPB6ssP4CCt7G1SWZ_8oNQEH2C1lktWgkyHYXDQJZNc,15073
-datachain/cli/parser/job.py,sha256=_wqOOxGRXG_-xuQ35FaLUOwjw6w8HviWvoEpZZ7VBzI,5289
+datachain/cli/parser/job.py,sha256=iytBZaCcQUhaOcRlYZFeAJsscN2T2XcEY7MibTeuZhg,5786
 datachain/cli/parser/studio.py,sha256=Bo__LKM7qhJGgkyX8M_bCvgZ2Gvqq6r_X4t1NdtaBIY,3881
 datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -70,13 +70,13 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=gMgmiMOhTGFMSyWBbjyzF2RsSXjx0XmUGPoSBxcWwe0,10756
-datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
+datachain/lib/arrow.py,sha256=geoLvyDd5uMqS3D9Ec1ODlShCUAdtwHUwl8FqbUX_hg,10776
+datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
-datachain/lib/data_model.py,sha256=JPHPO6z-pehyiY-qNBAnp8u015xUHrijPKbGkMHS6lo,3493
+datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
 datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
-datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
-datachain/lib/hf.py,sha256=dadHs2dsi4ALwXz92Y3T7AUgq3wQF4mBydWqHCMjvks,6880
+datachain/lib/file.py,sha256=_ch7xYcpl0kzImgEwccbQ-a5qb9rbEvx1vcuWerOn9k,42608
+datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -92,7 +92,7 @@ datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=SUnJWRDC3TlLhvpi8iqqJbeZGn5DChot7DyH-0Q-z20,17305
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
-datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
+datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
 datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7214
 datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -104,15 +104,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=ap54lcuj71tvp0zX1jiFFiEWvA5UPeyYJRJkd2APmlI,92897
+datachain/lib/dc/datachain.py,sha256=mLE5v4KhzEQm7HVWBTxY6EwJ2J-YeFVcLUY4I21216c,93212
 datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
-datachain/lib/dc/hf.py,sha256=MJWO-NL4jAD6CEAmXsyeqXEyvefRLMhyxhT9jKT5vMU,2324
+datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=FpPbApWopUri1gIaSMsfXN4fevja4mjmfb6Q5eiaGxI,3116
-datachain/lib/dc/storage.py,sha256=8xiV3c6k-sG14RGwNJCp0AbV6L0mNDsTVZ-Est-ccnw,7672
+datachain/lib/dc/storage.py,sha256=FXroEdxOZfbuEBIWfWTkbGwrI0D4_mrLZSRsIQm0WFE,7693
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -136,7 +136,7 @@ datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,68
 datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
 datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=oJp2KD9eO8zQDnPfNpAALZYsOlBfqVKKRTeCkEpcsYk,15196
+datachain/remote/studio.py,sha256=vsuqCAO65PBJKGLMxOvc3Bmieo2TJwcfc9YclxkzmFk,15350
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.26.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.26.3.dist-info/METADATA,sha256=HdG_quEq0rfrdKJJ_teSViVCXbXI3SxLlnh6tu2Mgfs,13543
-datachain-0.26.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.26.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.26.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.26.3.dist-info/RECORD,,
+datachain-0.27.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.27.0.dist-info/METADATA,sha256=PWZ_EWTpk1OvWlQZe__5SCjFem6BD1AtYmTxJ5wV3iY,13759
+datachain-0.27.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.27.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.27.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.27.0.dist-info/RECORD,,