datachain 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -17,7 +17,12 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     )
 
     studio_run_help = "Run a job in Studio"
-    studio_run_description = "Run a job in Studio."
+    studio_run_description = "Run a job in Studio. \n"
+    studio_run_description += (
+        "When using --start-time or --cron,"
+        " the job is scheduled to run but won't start immediately"
+        " (can be seen in the Tasks tab in UI)"
+    )
 
     studio_run_parser = jobs_subparser.add_parser(
         "run",
@@ -96,6 +101,14 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         help="Priority for the job in range 0-5. "
         "Lower value is higher priority (default: 5)",
     )
+    studio_run_parser.add_argument(
+        "--start-time",
+        action="store",
+        help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
+    )
+    studio_run_parser.add_argument(
+        "--cron", action="store", help="Cron expression for the cron task."
+    )
 
     studio_ls_help = "List jobs in Studio"
     studio_ls_description = "List jobs in Studio."
datachain/lib/arrow.py CHANGED
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
             if field.nullable and not ModelStore.is_pydantic(dtype):
                 dtype = Optional[dtype]  # type: ignore[assignment]
             type_dict[field.name] = dtype
-        return dict_to_data_model(column, type_dict)
+        return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
     if pa.types.is_map(col_type):
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
datachain/lib/audio.py CHANGED
@@ -33,10 +33,14 @@ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
         frames = int(info.num_frames)
         duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
 
-        # Get format information
-        format_name = getattr(info, "format", "")
         codec_name = getattr(info, "encoding", "")
-        bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
+        file_ext = file.get_file_ext().lower()
+        format_name = _encoding_to_format(codec_name, file_ext)
+
+        bits_per_sample = getattr(info, "bits_per_sample", 0)
+        bit_rate = (
+            bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
+        )
 
     except Exception as exc:
         raise FileError(
@@ -54,7 +58,47 @@ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
     )
 
 
-def audio_fragment_np(
+def _encoding_to_format(encoding: str, file_ext: str) -> str:
+    """
+    Map torchaudio encoding to a format name.
+
+    Args:
+        encoding: The encoding string from torchaudio.info()
+        file_ext: The file extension as a fallback
+
+    Returns:
+        Format name as a string
+    """
+    # Direct mapping for formats that match exactly
+    encoding_map = {
+        "FLAC": "flac",
+        "MP3": "mp3",
+        "VORBIS": "ogg",
+        "AMR_WB": "amr",
+        "AMR_NB": "amr",
+        "OPUS": "opus",
+        "GSM": "gsm",
+    }
+
+    if encoding in encoding_map:
+        return encoding_map[encoding]
+
+    # For PCM variants, use file extension to determine format
+    if encoding.startswith("PCM_"):
+        # Common PCM formats by extension
+        pcm_formats = {
+            "wav": "wav",
+            "aiff": "aiff",
+            "au": "au",
+            "raw": "raw",
+        }
+        return pcm_formats.get(file_ext, "wav")  # Default to wav for PCM
+
+    # Fallback to file extension if encoding is unknown
+    return file_ext if file_ext else "unknown"
+
+
+def audio_to_np(
     audio: "AudioFile", start: float = 0, duration: Optional[float] = None
 ) -> "tuple[ndarray, int]":
     """Load audio fragment as numpy array.
@@ -98,14 +142,17 @@ def audio_fragment_np(
         ) from exc
 
 
-def audio_fragment_bytes(
+def audio_to_bytes(
     audio: "AudioFile",
+    format: str = "wav",
     start: float = 0,
     duration: Optional[float] = None,
-    format: str = "wav",
 ) -> bytes:
-    """Convert audio fragment to bytes using soundfile."""
-    y, sr = audio_fragment_np(audio, start, duration)
+    """Convert audio to bytes using soundfile.
+
+    If duration is None, converts from start to end of file.
+    If start is 0 and duration is None, converts entire file."""
+    y, sr = audio_to_np(audio, start, duration)
 
     import io
 
@@ -116,36 +163,82 @@ def audio_fragment_bytes(
     return buffer.getvalue()
 
 
-def save_audio_fragment(
+def save_audio(
     audio: "AudioFile",
-    start: float,
-    end: float,
     output: str,
     format: Optional[str] = None,
+    start: float = 0,
+    end: Optional[float] = None,
 ) -> "AudioFile":
-    """Save audio fragment with timestamped filename.
-    Supports local and remote storage upload."""
-    if start < 0 or end < 0 or start >= end:
-        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
-
+    """Save audio file or extract fragment to specified format.
+
+    Args:
+        audio: Source AudioFile object
+        output: Output directory path
+        format: Output format ('wav', 'mp3', etc). Defaults to source format
+        start: Start time in seconds (>= 0). Defaults to 0
+        end: End time in seconds. If None, extracts to end of file
+
+    Returns:
+        AudioFile: New audio file with format conversion/extraction applied
+
+    Examples:
+        save_audio(audio, "/path", "mp3")  # Entire file to MP3
+        save_audio(audio, "s3://bucket/path", "wav", start=2.5)  # From 2.5s to end
+        save_audio(audio, "/path", "flac", start=1, end=3)  # Extract 1-3s fragment
+    """
     if format is None:
         format = audio.get_file_ext()
 
-    duration = end - start
-    start_ms = int(start * 1000)
-    end_ms = int(end * 1000)
-    output_file = posixpath.join(
-        output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
-    )
+    # Validate start time
+    if start < 0:
+        raise ValueError(
+            f"Can't save audio for '{audio.path}', "
+            f"start time must be non-negative: {start:.3f}"
+        )
+
+    # Handle full file conversion when end is None and start is 0
+    if end is None and start == 0:
+        output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
+        try:
+            audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
+        except Exception as exc:
+            raise FileError(
+                "unable to convert audio file", audio.source, audio.path
+            ) from exc
+    elif end is None:
+        # Extract from start to end of file
+        output_file = posixpath.join(
+            output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
+        )
+        try:
+            audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
+        except Exception as exc:
+            raise FileError(
+                "unable to save audio fragment", audio.source, audio.path
+            ) from exc
+    else:
+        # Fragment extraction mode with specific end time
+        if end < 0 or start >= end:
+            raise ValueError(
+                f"Can't save audio for '{audio.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
 
-    try:
-        audio_bytes = audio_fragment_bytes(audio, start, duration, format)
+        duration = end - start
+        start_ms = int(start * 1000)
+        end_ms = int(end * 1000)
+        output_file = posixpath.join(
+            output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+        )
 
-        from datachain.lib.file import AudioFile
+        try:
+            audio_bytes = audio_to_bytes(audio, format, start, duration)
+        except Exception as exc:
+            raise FileError(
+                "unable to save audio fragment", audio.source, audio.path
+            ) from exc
 
-        return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
+    from datachain.lib.file import AudioFile
 
-    except Exception as exc:
-        raise FileError(
-            "unable to save audio fragment", audio.source, audio.path
-        ) from exc
+    return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
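Note that `audio_to_bytes` is not a pure rename of `audio_fragment_bytes`: the `format` parameter moved from last position to second, ahead of `start` and `duration`, so positional callers need updating. A minimal migration sketch, assuming an existing `AudioFile` instance named `audio`:

```py
from datachain.lib.audio import audio_to_bytes

# 0.26.3 call shape: audio_fragment_bytes(audio, start, duration, format)
# 0.27.0 moves format ahead of start/duration:
fragment_mp3 = audio_to_bytes(audio, "mp3", start=1.0, duration=2.0)

# With the new defaults (start=0, duration=None), a bare call
# converts the entire file:
whole_wav = audio_to_bytes(audio, "wav")
```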
@@ -1,3 +1,5 @@
+import inspect
+import uuid
 from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Optional, Union, get_args, get_origin
@@ -80,7 +82,9 @@ def dict_to_data_model(
 
     fields = {
         name: (
-            anno,
+            anno
+            if inspect.isclass(anno) and issubclass(anno, BaseModel)
+            else Optional[anno],
             Field(
                 validation_alias=AliasChoices(name, original_names[idx] or name),
                 default=None,
@@ -101,6 +105,10 @@ def dict_to_data_model(
             field_info[str(alias)] = (_name, field)
         return field_info
 
+    # Generate random unique name if not provided
+    if not name:
+        name = f"DataModel_{uuid.uuid4().hex[:8]}"
+
     return create_model(
         name,
         __base__=_DataModelStrict,
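With the `uuid` fallback, callers may now pass an empty name and still get a uniquely named model class, and non-`BaseModel` annotations are wrapped in `Optional[...]` with a `None` default. A minimal sketch of the resulting behavior, assuming `dict_to_data_model` keeps its `(name, types)` signature:

```py
from datachain.lib.data_model import dict_to_data_model

# An empty name no longer matters: each call mints a fresh
# "DataModel_<8 hex chars>" class name, avoiding collisions.
Model = dict_to_data_model("", {"width": int, "height": int})
print(Model.__name__)  # e.g. "DataModel_1a2b3c4d"

# Plain annotations become Optional with default=None, so partially
# filled records validate instead of raising.
row = Model(width=640)
print(row.height)  # None
```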
@@ -2388,7 +2388,7 @@ class DataChain:
         placement: FileExportPlacement = "fullpath",
         link_type: Literal["copy", "symlink"] = "copy",
         num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
-        anon: bool = False,
+        anon: Optional[bool] = None,
         client_config: Optional[dict] = None,
     ) -> None:
         """Export files from a specified signal to a directory. Files can be
@@ -2403,7 +2403,11 @@ class DataChain:
                 Falls back to `'copy'` if symlinking fails.
             num_threads : number of threads to use for exporting files.
                 By default it uses 5 threads.
-            anon: If true, we will treat cloud bucket as public one
+            anon: If True, we will treat cloud bucket as public one. Default behavior
+                depends on the previous session configuration (e.g. happens in the
+                initial `read_storage`) and particular cloud storage client
+                implementation (e.g. S3 fallbacks to anonymous access if no credentials
+                were found).
             client_config: Optional configuration for the destination storage client
 
         Example:
@@ -2421,8 +2425,8 @@ class DataChain:
         ):
             raise ValueError("Files with the same name found")
 
-        if anon:
-            client_config = (client_config or {}) | {"anon": True}
+        if anon is not None:
+            client_config = (client_config or {}) | {"anon": anon}
 
         progress_bar = tqdm(
             desc=f"Exporting files to {output}: ",
datachain/lib/dc/hf.py CHANGED
@@ -25,19 +25,23 @@ def read_hf(
     settings: Optional[dict] = None,
     column: str = "",
     model_name: str = "",
+    limit: int = 0,
     **kwargs,
 ) -> "DataChain":
-    """Generate chain from huggingface hub dataset.
+    """Generate chain from Hugging Face Hub dataset.
 
     Parameters:
         dataset : Path or name of the dataset to read from Hugging Face Hub,
            or an instance of `datasets.Dataset`-like object.
-        args : Additional positional arguments to pass to datasets.load_dataset.
+        args : Additional positional arguments to pass to `datasets.load_dataset`.
         session : Session to use for the chain.
         settings : Settings to use for the chain.
         column : Generated object column name.
         model_name : Generated model name.
-        kwargs : Parameters to pass to datasets.load_dataset.
+        limit : Limit the number of items to read from the HF dataset.
+            Adds `take(limit)` to the `datasets.load_dataset`.
+            Defaults to 0 (no limit).
+        kwargs : Parameters to pass to `datasets.load_dataset`.
 
     Example:
         Load from Hugging Face Hub:
@@ -53,6 +57,18 @@ def read_hf(
        import datachain as dc
        chain = dc.read_hf(ds)
        ```
+
+        Streaming with limit, for large datasets:
+        ```py
+        import datachain as dc
+        ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
+        ```
+
+        or use HF split syntax (not supported if streaming is enabled):
+        ```py
+        import datachain as dc
+        ds = dc.read_hf("beans", split="train[%10]")
+        ```
     """
     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
 
@@ -72,4 +88,4 @@ def read_hf(
         output = {column: model}
 
     chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
-    return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
+    return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)
@@ -33,7 +33,7 @@ def read_storage(
     recursive: Optional[bool] = True,
     column: str = "file",
     update: bool = False,
-    anon: bool = False,
+    anon: Optional[bool] = None,
     delta: Optional[bool] = False,
     delta_on: Optional[Union[str, Sequence[str]]] = (
         "file.path",
@@ -124,8 +124,8 @@ def read_storage(
 
     file_type = get_file_type(type)
 
-    if anon:
-        client_config = (client_config or {}) | {"anon": True}
+    if anon is not None:
+        client_config = (client_config or {}) | {"anon": anon}
     session = Session.get(session, client_config=client_config, in_memory=in_memory)
     catalog = session.catalog
     cache = catalog.cache
datachain/lib/file.py CHANGED
@@ -717,6 +717,23 @@ class ImageFile(File):
         destination = stringify_path(destination)
 
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
+
+        # If format is not provided, determine it from the file extension
+        if format is None:
+            from pathlib import PurePosixPath
+
+            from PIL import Image as PilImage
+
+            ext = PurePosixPath(destination).suffix.lower()
+            format = PilImage.registered_extensions().get(ext)
+
+            if not format:
+                raise FileError(
+                    f"Can't determine format for destination '{destination}'",
+                    self.source,
+                    self.path,
+                )
+
         with client.fs.open(destination, mode="wb") as f:
             self.read().save(f, format=format)
 
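The fallback relies on Pillow's extension registry, which maps lowercase suffixes to format names. A quick illustration of what `ImageFile.save` now looks up:

```py
from PIL import Image as PilImage

# registered_extensions() maps suffixes like ".png" to PIL format names;
# ImageFile.save feeds the destination's suffix into this mapping.
exts = PilImage.registered_extensions()
print(exts[".png"])      # "PNG"
print(exts[".jpg"])      # "JPEG"
print(exts.get(".xyz"))  # None -> save() raises the new FileError
```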
@@ -815,7 +832,10 @@ class VideoFile(File):
             VideoFragment: A Model representing the video fragment.
         """
         if start < 0 or end < 0 or start >= end:
-            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+            raise ValueError(
+                f"Can't get video fragment for '{self.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
 
         return VideoFragment(video=self, start=start, end=end)
 
@@ -898,7 +918,10 @@ class AudioFile(File):
             AudioFragment: A Model representing the audio fragment.
         """
         if start < 0 or end < 0 or start >= end:
-            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+            raise ValueError(
+                f"Can't get audio fragment for '{self.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
 
         return AudioFragment(audio=self, start=start, end=end)
 
@@ -941,6 +964,35 @@ class AudioFile(File):
             yield self.get_fragment(start, min(start + duration, end))
             start += duration
 
+    def save(  # type: ignore[override]
+        self,
+        output: str,
+        format: Optional[str] = None,
+        start: float = 0,
+        end: Optional[float] = None,
+        client_config: Optional[dict] = None,
+    ) -> "AudioFile":
+        """Save audio file or extract fragment to specified format.
+
+        Args:
+            output: Output directory path
+            format: Output format ('wav', 'mp3', etc). Defaults to source format
+            start: Start time in seconds (>= 0). Defaults to 0
+            end: End time in seconds. If None, extracts to end of file
+            client_config: Optional client configuration
+
+        Returns:
+            AudioFile: New audio file with format conversion/extraction applied
+
+        Examples:
+            audio.save("/path", "mp3")  # Entire file to MP3
+            audio.save("s3://bucket/path", "wav", start=2.5)  # From 2.5s to end as WAV
+            audio.save("/path", "flac", start=1, end=3)  # 1-3s fragment as FLAC
+        """
+        from .audio import save_audio
+
+        return save_audio(self, output, format, start, end)
+
 
 class AudioFragment(DataModel):
     """
@@ -968,10 +1020,10 @@ class AudioFragment(DataModel):
             tuple[ndarray, int]: A tuple containing the audio data as a NumPy array
             and the sample rate.
         """
-        from .audio import audio_fragment_np
+        from .audio import audio_to_np
 
         duration = self.end - self.start
-        return audio_fragment_np(self.audio, self.start, duration)
+        return audio_to_np(self.audio, self.start, duration)
 
     def read_bytes(self, format: str = "wav") -> bytes:
         """
@@ -984,10 +1036,10 @@ class AudioFragment(DataModel):
         Returns:
             bytes: The encoded audio fragment as bytes.
         """
-        from .audio import audio_fragment_bytes
+        from .audio import audio_to_bytes
 
         duration = self.end - self.start
-        return audio_fragment_bytes(self.audio, self.start, duration, format)
+        return audio_to_bytes(self.audio, format, self.start, duration)
 
     def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
         """
@@ -1005,9 +1057,9 @@ class AudioFragment(DataModel):
         Returns:
             AudioFile: A Model representing the saved audio file.
         """
-        from .audio import save_audio_fragment
+        from .audio import save_audio
 
-        return save_audio_fragment(self.audio, self.start, self.end, output, format)
+        return save_audio(self.audio, output, format, self.start, self.end)
 
 
 class VideoFrame(DataModel):
datachain/lib/hf.py CHANGED
@@ -69,21 +69,25 @@ class HFGenerator(Generator):
         self,
         ds: Union[str, HFDatasetType],
         output_schema: type["BaseModel"],
+        limit: int = 0,
         *args,
         **kwargs,
     ):
         """
-        Generator for chain from huggingface datasets.
+        Generator for chain from Hugging Face datasets.
 
         Parameters:
 
-        ds : Path or name of the dataset to read from Hugging Face Hub,
-            or an instance of `datasets.Dataset`-like object.
-        output_schema : Pydantic model for validation.
+            ds : Path or name of the dataset to read from Hugging Face Hub,
+                or an instance of `datasets.Dataset`-like object.
+            limit : Limit the number of items to read from the HF dataset.
+                Defaults to 0 (no limit).
+            output_schema : Pydantic model for validation.
         """
         super().__init__()
         self.ds = ds
         self.output_schema = output_schema
+        self.limit = limit
         self.args = args
         self.kwargs = kwargs
 
@@ -93,6 +97,8 @@ class HFGenerator(Generator):
     def process(self, split: str = ""):
         desc = "Parsed Hugging Face dataset"
         ds = self.ds_dict[split]
+        if self.limit > 0:
+            ds = ds.take(self.limit)
         if split:
             desc += f" split '{split}'"
         model_fields = self.output_schema._model_fields_by_aliases()  # type: ignore[attr-defined]
@@ -113,7 +119,6 @@ class HFGenerator(Generator):
 
 def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     if isinstance(ds, str):
-        kwargs["streaming"] = True
         ds = load_dataset(ds, *args, **kwargs)
     if isinstance(ds, (DatasetDict, IterableDatasetDict)):
         return ds
@@ -132,7 +137,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
             sfeat = feat[sname]
             norm_name, info = model_fields[sname]
             sanno = info.annotation
-            sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
+            if isinstance(val[sname], list):
+                sdict[norm_name] = [
+                    convert_feature(v, sfeat, sanno) for v in val[sname]
+                ]
+            else:
+                sdict[norm_name] = convert_feature(val[sname], sfeat, sanno)
         return anno(**sdict)
     if isinstance(feat, Image):
         if isinstance(val, dict):
@@ -174,7 +184,7 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType:  # noqa: PLR0911
         for sname, sval in val.items():
             dtype = _feature_to_chain_type(sname, sval)
             sequence_dict[sname] = dtype  # type: ignore[valid-type]
-        return dict_to_data_model(name, sequence_dict)  # type: ignore[arg-type]
+        return dict_to_data_model(f"HFDataModel_{name}", sequence_dict)  # type: ignore[arg-type]
     if isinstance(val, List):
         return list[_feature_to_chain_type(name, val.feature)]  # type: ignore[arg-type,misc,return-value]
     if isinstance(val, Array2D):
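Dropping the forced `kwargs["streaming"] = True` means string datasets are now downloaded in full unless the caller opts into streaming; the new `limit` keeps streamed reads bounded via `take()`. A sketch of the underlying `datasets` calls, mirroring what `HFGenerator.process` does when `limit > 0`:

```py
from datasets import load_dataset

# Streaming is now opt-in (previously stream_splits always forced it).
ds = load_dataset("beans", split="train", streaming=True)

# Equivalent to HFGenerator.process with limit=10: take() caps how many
# items are pulled from the hub.
for row in ds.take(10):
    ...  # at most 10 items
```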
datachain/lib/video.py CHANGED
@@ -205,7 +205,10 @@ def save_video_fragment(
         VideoFile: Video fragment model.
     """
     if start < 0 or end < 0 or start >= end:
-        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+        raise ValueError(
+            f"Can't save video fragment for '{video.path}', "
+            f"invalid time range: ({start:.3f}, {end:.3f})"
+        )
 
     if format is None:
         format = video.get_file_ext()
@@ -429,6 +429,8 @@ class StudioClient:
         repository: Optional[str] = None,
         priority: Optional[int] = None,
         cluster: Optional[str] = None,
+        start_time: Optional[str] = None,
+        cron: Optional[str] = None,
     ) -> Response[JobData]:
         data = {
             "query": query,
@@ -442,6 +444,8 @@ class StudioClient:
             "repository": repository,
             "priority": priority,
             "compute_cluster_name": cluster,
+            "start_after": start_time,
+            "cron_expression": cron,
         }
         return self._send_request("datachain/job", data)
 
datachain/studio.py CHANGED
@@ -1,8 +1,10 @@
 import asyncio
 import os
 import sys
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Optional
 
+import dateparser
 import tabulate
 
 from datachain.config import Config, ConfigLevel
@@ -42,6 +44,8 @@ def process_jobs_args(args: "Namespace"):
             args.req_file,
             args.priority,
             args.cluster,
+            args.start_time,
+            args.cron,
         )
 
     if args.cmd == "cancel":
@@ -262,6 +266,24 @@ def save_config(hostname, token, level=ConfigLevel.GLOBAL):
     return config.config_file()
 
 
+def parse_start_time(start_time_str: Optional[str]) -> Optional[str]:
+    if not start_time_str:
+        return None
+
+    # Parse the datetime string using dateparser
+    parsed_datetime = dateparser.parse(start_time_str)
+
+    if parsed_datetime is None:
+        raise DataChainError(
+            f"Could not parse datetime string: '{start_time_str}'. "
+            f"Supported formats include: '2024-01-15 14:30:00', 'tomorrow 3pm', "
+            f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc."
+        )
+
+    # Convert to ISO format string
+    return parsed_datetime.isoformat()
+
+
 def show_logs_from_client(client, job_id):
     # Sync usage
     async def _run():
@@ -310,6 +332,8 @@ def create_job(
     req_file: Optional[str] = None,
     priority: Optional[int] = None,
     cluster: Optional[str] = None,
+    start_time: Optional[str] = None,
+    cron: Optional[str] = None,
 ):
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:
@@ -328,6 +352,11 @@ def create_job(
     client = StudioClient(team=team_name)
     file_ids = upload_files(client, files) if files else []
 
+    # Parse start_time if provided
+    parsed_start_time = parse_start_time(start_time)
+    if cron and parsed_start_time is None:
+        parsed_start_time = datetime.now(timezone.utc).isoformat()
+
     response = client.create_job(
         query=query,
         query_type=query_type,
@@ -340,6 +369,8 @@ def create_job(
         requirements=requirements,
         priority=priority,
         cluster=cluster,
+        start_time=parsed_start_time,
+        cron=cron,
     )
     if not response.ok:
         raise DataChainError(response.message)
@@ -348,6 +379,11 @@ def create_job(
         raise DataChainError("Failed to create job")
 
     job_id = response.data.get("job", {}).get("id")
+
+    if parsed_start_time or cron:
+        print(f"Job {job_id} is scheduled as a task in Studio.")
+        return 0
+
     print(f"Job {job_id} created")
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
     print("=" * 40)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.26.3
+Version: 0.27.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -26,6 +26,7 @@ Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
 Requires-Dist: python-dateutil>=2
+Requires-Dist: dateparser>=1.0.0
 Requires-Dist: attrs>=21.3.0
 Requires-Dist: fsspec>=2024.2.0
 Requires-Dist: s3fs>=2024.2.0
@@ -100,6 +101,7 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
 Requires-Dist: mypy==1.17.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
@@ -118,7 +120,7 @@ Dynamic: license-file
 |logo| DataChain
 ================
 
-|PyPI| |Python Version| |Codecov| |Tests|
+|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
 
 .. |logo| image:: docs/assets/datachain.svg
    :height: 24
@@ -134,6 +136,9 @@ Dynamic: license-file
 .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
+.. |DeepWiki| image:: https://deepwiki.com/badge.svg
+   :target: https://deepwiki.com/iterative/datachain
+   :alt: DeepWiki
 
 DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
 data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -17,7 +17,7 @@ datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
-datachain/studio.py,sha256=bLok-eJNFRHQScEyAyA_Fas52dmijd5r-73KudWxV4k,13337
+datachain/studio.py,sha256=RCpVZdHRX-ClEddXaAsZDGFy5o-SOqVCa5NhLj8337s,14486
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
@@ -35,7 +35,7 @@ datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibV
 datachain/cli/commands/query.py,sha256=Xzfgh14nPVH-sclqX1tpZqgfdTugw5s_44v0D33z6FA,1505
 datachain/cli/commands/show.py,sha256=Cf8wBs12h-xtdOzjU5GTDy2C8rF5HJSF0hDJYER1zH8,1606
 datachain/cli/parser/__init__.py,sha256=NPB6ssP4CCt7G1SWZ_8oNQEH2C1lktWgkyHYXDQJZNc,15073
-datachain/cli/parser/job.py,sha256=_wqOOxGRXG_-xuQ35FaLUOwjw6w8HviWvoEpZZ7VBzI,5289
+datachain/cli/parser/job.py,sha256=iytBZaCcQUhaOcRlYZFeAJsscN2T2XcEY7MibTeuZhg,5786
 datachain/cli/parser/studio.py,sha256=Bo__LKM7qhJGgkyX8M_bCvgZ2Gvqq6r_X4t1NdtaBIY,3881
 datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -70,13 +70,13 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=gMgmiMOhTGFMSyWBbjyzF2RsSXjx0XmUGPoSBxcWwe0,10756
-datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
+datachain/lib/arrow.py,sha256=geoLvyDd5uMqS3D9Ec1ODlShCUAdtwHUwl8FqbUX_hg,10776
+datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
-datachain/lib/data_model.py,sha256=JPHPO6z-pehyiY-qNBAnp8u015xUHrijPKbGkMHS6lo,3493
+datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
 datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
-datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
-datachain/lib/hf.py,sha256=dadHs2dsi4ALwXz92Y3T7AUgq3wQF4mBydWqHCMjvks,6880
+datachain/lib/file.py,sha256=_ch7xYcpl0kzImgEwccbQ-a5qb9rbEvx1vcuWerOn9k,42608
+datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -92,7 +92,7 @@ datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=SUnJWRDC3TlLhvpi8iqqJbeZGn5DChot7DyH-0Q-z20,17305
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
-datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
+datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
 datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7214
 datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -104,15 +104,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=ap54lcuj71tvp0zX1jiFFiEWvA5UPeyYJRJkd2APmlI,92897
+datachain/lib/dc/datachain.py,sha256=mLE5v4KhzEQm7HVWBTxY6EwJ2J-YeFVcLUY4I21216c,93212
 datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
-datachain/lib/dc/hf.py,sha256=MJWO-NL4jAD6CEAmXsyeqXEyvefRLMhyxhT9jKT5vMU,2324
+datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=FpPbApWopUri1gIaSMsfXN4fevja4mjmfb6Q5eiaGxI,3116
-datachain/lib/dc/storage.py,sha256=8xiV3c6k-sG14RGwNJCp0AbV6L0mNDsTVZ-Est-ccnw,7672
+datachain/lib/dc/storage.py,sha256=FXroEdxOZfbuEBIWfWTkbGwrI0D4_mrLZSRsIQm0WFE,7693
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -136,7 +136,7 @@ datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,68
 datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
 datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=oJp2KD9eO8zQDnPfNpAALZYsOlBfqVKKRTeCkEpcsYk,15196
+datachain/remote/studio.py,sha256=vsuqCAO65PBJKGLMxOvc3Bmieo2TJwcfc9YclxkzmFk,15350
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.26.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.26.3.dist-info/METADATA,sha256=HdG_quEq0rfrdKJJ_teSViVCXbXI3SxLlnh6tu2Mgfs,13543
-datachain-0.26.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.26.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.26.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.26.3.dist-info/RECORD,,
+datachain-0.27.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.27.0.dist-info/METADATA,sha256=PWZ_EWTpk1OvWlQZe__5SCjFem6BD1AtYmTxJ5wV3iY,13759
+datachain-0.27.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.27.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.27.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.27.0.dist-info/RECORD,,