datachain 0.26.4__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release.

This version of datachain might be problematic.

datachain/cli/parser/job.py CHANGED
@@ -20,8 +20,8 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
  studio_run_description = "Run a job in Studio. \n"
  studio_run_description += (
  "When using --start-time or --cron,"
- " the job is scheduled as a task and will not show logs immediately."
- " The job will be executed according to the schedule."
+ " the job is scheduled to run but won't start immediately"
+ " (can be seen in the Tasks tab in UI)"
  )

  studio_run_parser = jobs_subparser.add_parser(
@@ -104,11 +104,16 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
  studio_run_parser.add_argument(
  "--start-time",
  action="store",
- help="Start time in ISO format or natural language for the cron task.",
+ help="Time to schedule a task in YYYY-MM-DDTHH:mm format or natural language.",
  )
  studio_run_parser.add_argument(
  "--cron", action="store", help="Cron expression for the cron task."
  )
+ studio_run_parser.add_argument(
+ "--no-wait",
+ action="store_true",
+ help="Do not wait for the job to finish",
+ )

  studio_ls_help = "List jobs in Studio"
  studio_ls_description = "List jobs in Studio."
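For reference, a minimal standalone sketch (a hypothetical parser, not DataChain's real CLI wiring) of how the new `--no-wait` flag behaves under argparse: `action="store_true"` means it defaults to `False` and flips to `True` only when passed, which `create_job` later uses to skip log tailing.

```py
# Hypothetical argparse sketch of the --no-wait flag's behavior.
import argparse

parser = argparse.ArgumentParser(prog="jobs-run-sketch")
parser.add_argument(
    "--no-wait",
    action="store_true",  # defaults to False; True only when the flag is given
    help="Do not wait for the job to finish",
)

args = parser.parse_args(["--no-wait"])
assert args.no_wait is True  # create_job() returns 0 immediately in this case
```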
datachain/data_storage/job.py CHANGED
@@ -12,10 +12,11 @@ class JobStatus(int, Enum):
  CANCELING = 7
  CANCELED = 8
  CANCELING_SCHEDULED = 9
+ TASK = 11

  @classmethod
  def finished(cls) -> tuple[int, ...]:
- return cls.COMPLETE, cls.FAILED, cls.CANCELED
+ return cls.COMPLETE, cls.FAILED, cls.CANCELED, cls.TASK


  class JobQueryType(int, Enum):
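A self-contained sketch of how the widened `finished()` tuple is consumed. The `CANCELED` and `TASK` values mirror the hunk above; the `COMPLETE` and `FAILED` values are not shown in this hunk and are assumed, and the membership check is an assumed usage pattern.

```py
from enum import Enum

class JobStatus(int, Enum):
    COMPLETE = 4   # value assumed; not shown in this hunk
    FAILED = 5     # value assumed; not shown in this hunk
    CANCELED = 8
    TASK = 11

    @classmethod
    def finished(cls) -> tuple["JobStatus", ...]:
        return cls.COMPLETE, cls.FAILED, cls.CANCELED, cls.TASK

# A scheduled job that lands in TASK status now counts as finished,
# so log tailing can stop instead of polling forever.
assert JobStatus.TASK in JobStatus.finished()
```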
datachain/lib/arrow.py CHANGED
@@ -245,7 +245,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
  if field.nullable and not ModelStore.is_pydantic(dtype):
  dtype = Optional[dtype] # type: ignore[assignment]
  type_dict[field.name] = dtype
- return dict_to_data_model(column, type_dict)
+ return dict_to_data_model(f"ArrowDataModel_{column}", type_dict)
  if pa.types.is_map(col_type):
  return dict
  if isinstance(col_type, pa.lib.DictionaryType):
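A quick sketch of what the `ArrowDataModel_` prefix buys: the generated pydantic models are created by name, so a prefixed name is less likely to clash with other dynamically generated models than a bare column name. The field names and types here are hypothetical.

```py
# Hypothetical dynamically created model, named the way the diff now names them.
from typing import Optional
from pydantic import create_model

Model = create_model(
    "ArrowDataModel_payload",     # prefixed, like the diff now does
    x=(Optional[int], None),
    y=(Optional[str], None),
)
print(Model.__name__)  # ArrowDataModel_payload
```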
datachain/lib/audio.py CHANGED
@@ -33,10 +33,14 @@ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
  frames = int(info.num_frames)
  duration = float(frames / sample_rate) if sample_rate > 0 else 0.0

- # Get format information
- format_name = getattr(info, "format", "")
  codec_name = getattr(info, "encoding", "")
- bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
+ file_ext = file.get_file_ext().lower()
+ format_name = _encoding_to_format(codec_name, file_ext)
+
+ bits_per_sample = getattr(info, "bits_per_sample", 0)
+ bit_rate = (
+ bits_per_sample * sample_rate * channels if bits_per_sample > 0 else -1
+ )

  except Exception as exc:
  raise FileError(
@@ -54,7 +58,47 @@ def audio_info(file: "Union[File, AudioFile]") -> "Audio":
  )


- def audio_fragment_np(
+ def _encoding_to_format(encoding: str, file_ext: str) -> str:
+ """
+ Map torchaudio encoding to a format name.
+
+ Args:
+ encoding: The encoding string from torchaudio.info()
+ file_ext: The file extension as a fallback
+
+ Returns:
+ Format name as a string
+ """
+ # Direct mapping for formats that match exactly
+ encoding_map = {
+ "FLAC": "flac",
+ "MP3": "mp3",
+ "VORBIS": "ogg",
+ "AMR_WB": "amr",
+ "AMR_NB": "amr",
+ "OPUS": "opus",
+ "GSM": "gsm",
+ }
+
+ if encoding in encoding_map:
+ return encoding_map[encoding]
+
+ # For PCM variants, use file extension to determine format
+ if encoding.startswith("PCM_"):
+ # Common PCM formats by extension
+ pcm_formats = {
+ "wav": "wav",
+ "aiff": "aiff",
+ "au": "au",
+ "raw": "raw",
+ }
+ return pcm_formats.get(file_ext, "wav") # Default to wav for PCM
+
+ # Fallback to file extension if encoding is unknown
+ return file_ext if file_ext else "unknown"
+
+
+ def audio_to_np(
  audio: "AudioFile", start: float = 0, duration: Optional[float] = None
  ) -> "tuple[ndarray, int]":
  """Load audio fragment as numpy array.
@@ -98,14 +142,17 @@ def audio_fragment_np(
  ) from exc


- def audio_fragment_bytes(
+ def audio_to_bytes(
  audio: "AudioFile",
+ format: str = "wav",
  start: float = 0,
  duration: Optional[float] = None,
- format: str = "wav",
  ) -> bytes:
- """Convert audio fragment to bytes using soundfile."""
- y, sr = audio_fragment_np(audio, start, duration)
+ """Convert audio to bytes using soundfile.
+
+ If duration is None, converts from start to end of file.
+ If start is 0 and duration is None, converts entire file."""
+ y, sr = audio_to_np(audio, start, duration)

  import io

@@ -116,36 +163,82 @@ def audio_fragment_bytes(
  return buffer.getvalue()


- def save_audio_fragment(
+ def save_audio(
  audio: "AudioFile",
- start: float,
- end: float,
  output: str,
  format: Optional[str] = None,
+ start: float = 0,
+ end: Optional[float] = None,
  ) -> "AudioFile":
- """Save audio fragment with timestamped filename.
- Supports local and remote storage upload."""
- if start < 0 or end < 0 or start >= end:
- raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
-
+ """Save audio file or extract fragment to specified format.
+
+ Args:
+ audio: Source AudioFile object
+ output: Output directory path
+ format: Output format ('wav', 'mp3', etc). Defaults to source format
+ start: Start time in seconds (>= 0). Defaults to 0
+ end: End time in seconds. If None, extracts to end of file
+
+ Returns:
+ AudioFile: New audio file with format conversion/extraction applied
+
+ Examples:
+ save_audio(audio, "/path", "mp3") # Entire file to MP3
+ save_audio(audio, "s3://bucket/path", "wav", start=2.5) # From 2.5s to end
+ save_audio(audio, "/path", "flac", start=1, end=3) # Extract 1-3s fragment
+ """
  if format is None:
  format = audio.get_file_ext()

- duration = end - start
- start_ms = int(start * 1000)
- end_ms = int(end * 1000)
- output_file = posixpath.join(
- output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
- )
+ # Validate start time
+ if start < 0:
+ raise ValueError(
+ f"Can't save audio for '{audio.path}', "
+ f"start time must be non-negative: {start:.3f}"
+ )
+
+ # Handle full file conversion when end is None and start is 0
+ if end is None and start == 0:
+ output_file = posixpath.join(output, f"{audio.get_file_stem()}.{format}")
+ try:
+ audio_bytes = audio_to_bytes(audio, format, start=0, duration=None)
+ except Exception as exc:
+ raise FileError(
+ "unable to convert audio file", audio.source, audio.path
+ ) from exc
+ elif end is None:
+ # Extract from start to end of file
+ output_file = posixpath.join(
+ output, f"{audio.get_file_stem()}_{int(start * 1000):06d}_end.{format}"
+ )
+ try:
+ audio_bytes = audio_to_bytes(audio, format, start=start, duration=None)
+ except Exception as exc:
+ raise FileError(
+ "unable to save audio fragment", audio.source, audio.path
+ ) from exc
+ else:
+ # Fragment extraction mode with specific end time
+ if end < 0 or start >= end:
+ raise ValueError(
+ f"Can't save audio for '{audio.path}', "
+ f"invalid time range: ({start:.3f}, {end:.3f})"
+ )

- try:
- audio_bytes = audio_fragment_bytes(audio, start, duration, format)
+ duration = end - start
+ start_ms = int(start * 1000)
+ end_ms = int(end * 1000)
+ output_file = posixpath.join(
+ output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+ )

- from datachain.lib.file import AudioFile
+ try:
+ audio_bytes = audio_to_bytes(audio, format, start, duration)
+ except Exception as exc:
+ raise FileError(
+ "unable to save audio fragment", audio.source, audio.path
+ ) from exc

- return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
+ from datachain.lib.file import AudioFile

- except Exception as exc:
- raise FileError(
- "unable to save audio fragment", audio.source, audio.path
- ) from exc
+ return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
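The new `_encoding_to_format` rules can be exercised standalone. This sketch copies the mapping logic from the hunk above and shows all three branches: direct map, PCM decided by extension, and extension fallback.

```py
# Standalone copy of the mapping rules from _encoding_to_format above,
# exercised outside torchaudio for illustration.
def encoding_to_format(encoding: str, file_ext: str) -> str:
    encoding_map = {
        "FLAC": "flac", "MP3": "mp3", "VORBIS": "ogg",
        "AMR_WB": "amr", "AMR_NB": "amr", "OPUS": "opus", "GSM": "gsm",
    }
    if encoding in encoding_map:
        return encoding_map[encoding]
    if encoding.startswith("PCM_"):  # PCM variants: decide by extension
        return {"wav": "wav", "aiff": "aiff", "au": "au", "raw": "raw"}.get(
            file_ext, "wav"
        )
    return file_ext or "unknown"

assert encoding_to_format("VORBIS", "ogg") == "ogg"   # direct map
assert encoding_to_format("PCM_S", "aiff") == "aiff"  # PCM by extension
assert encoding_to_format("", "m4a") == "m4a"         # extension fallback
```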
datachain/lib/data_model.py CHANGED
@@ -1,3 +1,5 @@
+ import inspect
+ import uuid
  from collections.abc import Sequence
  from datetime import datetime
  from typing import ClassVar, Optional, Union, get_args, get_origin
@@ -80,7 +82,9 @@ def dict_to_data_model(

  fields = {
  name: (
- anno,
+ anno
+ if inspect.isclass(anno) and issubclass(anno, BaseModel)
+ else Optional[anno],
  Field(
  validation_alias=AliasChoices(name, original_names[idx] or name),
  default=None,
@@ -101,6 +105,10 @@ def dict_to_data_model(
  field_info[str(alias)] = (_name, field)
  return field_info

+ # Generate random unique name if not provided
+ if not name:
+ name = f"DataModel_{uuid.uuid4().hex[:8]}"
+
  return create_model(
  name,
  __base__=_DataModelStrict,
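A minimal illustration of the two behavioral changes above: plain annotations get wrapped in `Optional[...]` while pydantic `BaseModel` subclasses are left as-is, and an empty model name now falls back to a random `DataModel_<hex>` default.

```py
# Sketch of the two new rules in dict_to_data_model, run in isolation.
import inspect
import uuid
from typing import Optional
from pydantic import BaseModel

def wrap(anno):
    # Non-model annotations become Optional[...]; BaseModel subclasses stay as-is.
    return anno if inspect.isclass(anno) and issubclass(anno, BaseModel) else Optional[anno]

print(wrap(int))    # typing.Optional[int]

class Inner(BaseModel):
    x: int = 0

print(wrap(Inner))  # <class 'Inner'>

name = ""
if not name:        # empty name -> random unique default
    name = f"DataModel_{uuid.uuid4().hex[:8]}"
print(name)         # e.g. DataModel_3fa9c2d1
```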
datachain/lib/dc/hf.py CHANGED
@@ -25,19 +25,23 @@ def read_hf(
  settings: Optional[dict] = None,
  column: str = "",
  model_name: str = "",
+ limit: int = 0,
  **kwargs,
  ) -> "DataChain":
- """Generate chain from huggingface hub dataset.
+ """Generate chain from Hugging Face Hub dataset.

  Parameters:
  dataset : Path or name of the dataset to read from Hugging Face Hub,
  or an instance of `datasets.Dataset`-like object.
- args : Additional positional arguments to pass to datasets.load_dataset.
+ args : Additional positional arguments to pass to `datasets.load_dataset`.
  session : Session to use for the chain.
  settings : Settings to use for the chain.
  column : Generated object column name.
  model_name : Generated model name.
- kwargs : Parameters to pass to datasets.load_dataset.
+ limit : Limit the number of items to read from the HF dataset.
+ Adds `take(limit)` to the `datasets.load_dataset`.
+ Defaults to 0 (no limit).
+ kwargs : Parameters to pass to `datasets.load_dataset`.

  Example:
  Load from Hugging Face Hub:
@@ -53,6 +57,18 @@ def read_hf(
  import datachain as dc
  chain = dc.read_hf(ds)
  ```
+
+ Streaming with limit, for large datasets:
+ ```py
+ import datachain as dc
+ ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
+ ```
+
+ or use HF split syntax (not supported if streaming is enabled):
+ ```py
+ import datachain as dc
+ ds = dc.read_hf("beans", split="train[%10]")
+ ```
  """
  from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits

@@ -72,4 +88,4 @@ def read_hf(
  output = {column: model}

  chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
- return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
+ return chain.gen(HFGenerator(dataset, model, limit, *args, **kwargs), output=output)
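Under the hood, `limit` relies on the `datasets` library's `take()`. This is roughly what `HFGenerator.process()` does when `limit > 0` (see the `datachain/lib/hf.py` hunks below); "beans" is just an example dataset name.

```py
# Roughly the effect of limit=10 with streaming enabled.
from datasets import load_dataset

ds = load_dataset("beans", split="train", streaming=True)
ds = ds.take(10)  # stop after 10 items instead of streaming the full split
for item in ds:
    ...
```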
datachain/lib/file.py CHANGED
@@ -832,7 +832,10 @@ class VideoFile(File):
  VideoFragment: A Model representing the video fragment.
  """
  if start < 0 or end < 0 or start >= end:
- raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+ raise ValueError(
+ f"Can't get video fragment for '{self.path}', "
+ f"invalid time range: ({start:.3f}, {end:.3f})"
+ )

  return VideoFragment(video=self, start=start, end=end)

@@ -915,7 +918,10 @@ class AudioFile(File):
  AudioFragment: A Model representing the audio fragment.
  """
  if start < 0 or end < 0 or start >= end:
- raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+ raise ValueError(
+ f"Can't get audio fragment for '{self.path}', "
+ f"invalid time range: ({start:.3f}, {end:.3f})"
+ )

  return AudioFragment(audio=self, start=start, end=end)

@@ -958,6 +964,35 @@ class AudioFile(File):
  yield self.get_fragment(start, min(start + duration, end))
  start += duration

+ def save( # type: ignore[override]
+ self,
+ output: str,
+ format: Optional[str] = None,
+ start: float = 0,
+ end: Optional[float] = None,
+ client_config: Optional[dict] = None,
+ ) -> "AudioFile":
+ """Save audio file or extract fragment to specified format.
+
+ Args:
+ output: Output directory path
+ format: Output format ('wav', 'mp3', etc). Defaults to source format
+ start: Start time in seconds (>= 0). Defaults to 0
+ end: End time in seconds. If None, extracts to end of file
+ client_config: Optional client configuration
+
+ Returns:
+ AudioFile: New audio file with format conversion/extraction applied
+
+ Examples:
+ audio.save("/path", "mp3") # Entire file to MP3
+ audio.save("s3://bucket/path", "wav", start=2.5) # From 2.5s to end as WAV
+ audio.save("/path", "flac", start=1, end=3) # 1-3s fragment as FLAC
+ """
+ from .audio import save_audio
+
+ return save_audio(self, output, format, start, end)
+

  class AudioFragment(DataModel):
  """
@@ -985,10 +1020,10 @@ class AudioFragment(DataModel):
  tuple[ndarray, int]: A tuple containing the audio data as a NumPy array
  and the sample rate.
  """
- from .audio import audio_fragment_np
+ from .audio import audio_to_np

  duration = self.end - self.start
- return audio_fragment_np(self.audio, self.start, duration)
+ return audio_to_np(self.audio, self.start, duration)

  def read_bytes(self, format: str = "wav") -> bytes:
  """
@@ -1001,10 +1036,10 @@ class AudioFragment(DataModel):
  Returns:
  bytes: The encoded audio fragment as bytes.
  """
- from .audio import audio_fragment_bytes
+ from .audio import audio_to_bytes

  duration = self.end - self.start
- return audio_fragment_bytes(self.audio, self.start, duration, format)
+ return audio_to_bytes(self.audio, format, self.start, duration)

  def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
  """
@@ -1022,9 +1057,9 @@ class AudioFragment(DataModel):
  Returns:
  AudioFile: A Model representing the saved audio file.
  """
- from .audio import save_audio_fragment
+ from .audio import save_audio

- return save_audio_fragment(self.audio, self.start, self.end, output, format)
+ return save_audio(self.audio, output, format, self.start, self.end)


  class VideoFrame(DataModel):
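Taken together with the renamed helpers, a fragment round-trip might look like this sketch. It is illustrative only: `audio` is assumed to be an `AudioFile` obtained from a DataChain query, and the paths and timestamps are placeholders.

```py
# Illustrative usage of the APIs shown above; `audio` is an assumed AudioFile.
fragment = audio.get_fragment(1.0, 3.0)      # validates 0 <= start < end
wav_bytes = fragment.read_bytes(format="wav")
saved = fragment.save("/tmp/fragments", format="flac")

# Or convert/trim the whole file via the new AudioFile.save():
converted = audio.save("/tmp/converted", format="mp3")        # entire file
tail = audio.save("/tmp/converted", format="wav", start=2.5)  # from 2.5s on
```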
datachain/lib/hf.py CHANGED
@@ -69,21 +69,25 @@ class HFGenerator(Generator):
  self,
  ds: Union[str, HFDatasetType],
  output_schema: type["BaseModel"],
+ limit: int = 0,
  *args,
  **kwargs,
  ):
  """
- Generator for chain from huggingface datasets.
+ Generator for chain from Hugging Face datasets.

  Parameters:

- ds : Path or name of the dataset to read from Hugging Face Hub,
- or an instance of `datasets.Dataset`-like object.
- output_schema : Pydantic model for validation.
+ ds : Path or name of the dataset to read from Hugging Face Hub,
+ or an instance of `datasets.Dataset`-like object.
+ limit : Limit the number of items to read from the HF dataset.
+ Defaults to 0 (no limit).
+ output_schema : Pydantic model for validation.
  """
  super().__init__()
  self.ds = ds
  self.output_schema = output_schema
+ self.limit = limit
  self.args = args
  self.kwargs = kwargs

@@ -93,6 +97,8 @@ class HFGenerator(Generator):
  def process(self, split: str = ""):
  desc = "Parsed Hugging Face dataset"
  ds = self.ds_dict[split]
+ if self.limit > 0:
+ ds = ds.take(self.limit)
  if split:
  desc += f" split '{split}'"
  model_fields = self.output_schema._model_fields_by_aliases() # type: ignore[attr-defined]
@@ -113,7 +119,6 @@ class HFGenerator(Generator):

  def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
  if isinstance(ds, str):
- kwargs["streaming"] = True
  ds = load_dataset(ds, *args, **kwargs)
  if isinstance(ds, (DatasetDict, IterableDatasetDict)):
  return ds
@@ -132,7 +137,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
  sfeat = feat[sname]
  norm_name, info = model_fields[sname]
  sanno = info.annotation
- sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
+ if isinstance(val[sname], list):
+ sdict[norm_name] = [
+ convert_feature(v, sfeat, sanno) for v in val[sname]
+ ]
+ else:
+ sdict[norm_name] = convert_feature(val[sname], sfeat, sanno)
  return anno(**sdict)
  if isinstance(feat, Image):
  if isinstance(val, dict):
@@ -174,7 +184,7 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
  for sname, sval in val.items():
  dtype = _feature_to_chain_type(sname, sval)
  sequence_dict[sname] = dtype # type: ignore[valid-type]
- return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
+ return dict_to_data_model(f"HFDataModel_{name}", sequence_dict) # type: ignore[arg-type]
  if isinstance(val, List):
  return list[_feature_to_chain_type(name, val.feature)] # type: ignore[arg-type,misc,return-value]
  if isinstance(val, Array2D):
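The `convert_feature` change means nested struct values no longer have to be lists. A minimal sketch of the new dispatch, with a stand-in converter instead of the real recursive call over HF feature objects:

```py
# Minimal sketch of the list-vs-scalar dispatch added to convert_feature;
# convert() stands in for the real recursive convert_feature call.
def convert(v):
    return str(v).upper()

def convert_nested(value):
    if isinstance(value, list):
        return [convert(v) for v in value]  # sequence-of-struct case (old path)
    return convert(value)                   # plain nested struct case (new path)

assert convert_nested(["a", "b"]) == ["A", "B"]
assert convert_nested("a") == "A"
```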
datachain/lib/video.py CHANGED
@@ -205,7 +205,10 @@ def save_video_fragment(
  VideoFile: Video fragment model.
  """
  if start < 0 or end < 0 or start >= end:
- raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+ raise ValueError(
+ f"Can't save video fragment for '{video.path}', "
+ f"invalid time range: ({start:.3f}, {end:.3f})"
+ )

  if format is None:
  format = video.get_file_ext()
datachain/studio.py CHANGED
@@ -8,6 +8,7 @@ import dateparser
  import tabulate

  from datachain.config import Config, ConfigLevel
+ from datachain.data_storage.job import JobStatus
  from datachain.dataset import QUERY_DATASET_PREFIX, parse_dataset_name
  from datachain.error import DataChainError
  from datachain.remote.studio import StudioClient
@@ -20,6 +21,8 @@ POST_LOGIN_MESSAGE = (
  "Once you've logged in, return here "
  "and you'll be ready to start using DataChain with Studio."
  )
+ RETRY_MAX_TIMES = 10
+ RETRY_SLEEP_SEC = 1


  def process_jobs_args(args: "Namespace"):
@@ -46,6 +49,7 @@ def process_jobs_args(args: "Namespace"):
  args.cluster,
  args.start_time,
  args.cron,
+ args.no_wait,
  )

  if args.cmd == "cancel":
@@ -270,41 +274,51 @@ def parse_start_time(start_time_str: Optional[str]) -> Optional[str]:
  if not start_time_str:
  return None

- try:
- # Parse the datetime string using dateparser
- parsed_datetime = dateparser.parse(start_time_str)
-
- if parsed_datetime is None:
- raise DataChainError(
- f"Could not parse datetime string: '{start_time_str}'. "
- f"Supported formats include: '2024-01-15 14:30:00', 'tomorrow 3pm', "
- f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc."
- )
+ # Parse the datetime string using dateparser
+ parsed_datetime = dateparser.parse(start_time_str)

- # Convert to ISO format string
- return parsed_datetime.isoformat()
- except Exception as e:
+ if parsed_datetime is None:
  raise DataChainError(
- f"Invalid datetime format for start_time: '{start_time_str}'. "
+ f"Could not parse datetime string: '{start_time_str}'. "
  f"Supported formats include: '2024-01-15 14:30:00', 'tomorrow 3pm', "
- f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc. Error: {e}"
- ) from e
+ f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc."
+ )
+
+ # Convert to ISO format string
+ return parsed_datetime.isoformat()


  def show_logs_from_client(client, job_id):
  # Sync usage
  async def _run():
+ retry_count = 0
  latest_status = None
- async for message in client.tail_job_logs(job_id):
- if "logs" in message:
- for log in message["logs"]:
- print(log["message"], end="")
- elif "job" in message:
- latest_status = message["job"]["status"]
- print(f"\n>>>> Job is now in {latest_status} status.")
+ processed_statuses = set()
+ while True:
+ async for message in client.tail_job_logs(job_id):
+ if "logs" in message:
+ for log in message["logs"]:
+ print(log["message"], end="")
+ elif "job" in message:
+ latest_status = message["job"]["status"]
+ if latest_status in processed_statuses:
+ continue
+ processed_statuses.add(latest_status)
+ print(f"\n>>>> Job is now in {latest_status} status.")
+
+ try:
+ if retry_count > RETRY_MAX_TIMES or (
+ latest_status and JobStatus[latest_status].finished()
+ ):
+ break
+ await asyncio.sleep(RETRY_SLEEP_SEC)
+ retry_count += 1
+ except KeyError:
+ pass
+
  return latest_status

- latest_status = asyncio.run(_run())
+ final_status = asyncio.run(_run())

  response = client.dataset_job_versions(job_id)
  if not response.ok:
@@ -321,9 +335,9 @@ def show_logs_from_client(client, job_id):

  exit_code_by_status = {
  "FAILED": 1,
- "CANCELLED": 2,
+ "CANCELED": 2,
  }
- return exit_code_by_status.get(latest_status.upper(), 0) if latest_status else 0
+ return exit_code_by_status.get(final_status.upper(), 0) if final_status else 0


  def create_job(
@@ -341,6 +355,7 @@ def create_job(
  cluster: Optional[str] = None,
  start_time: Optional[str] = None,
  cron: Optional[str] = None,
+ no_wait: Optional[bool] = False,
  ):
  query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
  with open(query_file) as f:
@@ -395,7 +410,7 @@ def create_job(
  print("Open the job in Studio at", response.data.get("job", {}).get("url"))
  print("=" * 40)

- return show_logs_from_client(client, job_id)
+ return 0 if no_wait else show_logs_from_client(client, job_id)


  def upload_files(client: StudioClient, files: list[str]) -> list[str]:
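The log-tailing change turns a single pass over the log stream into a bounded polling loop. A self-contained sketch of the same pattern, with a fake client standing in for `StudioClient` and a plain status set standing in for `JobStatus.finished()`:

```py
# Self-contained sketch of the bounded retry loop added to
# show_logs_from_client(); FakeClient stands in for StudioClient.
import asyncio

RETRY_MAX_TIMES = 10
RETRY_SLEEP_SEC = 1
FINISHED = {"COMPLETE", "FAILED", "CANCELED", "TASK"}

class FakeClient:
    def __init__(self):
        self._statuses = iter(["RUNNING", "RUNNING", "COMPLETE"])

    async def tail_job_logs(self, job_id):
        yield {"logs": [{"message": f"log line for {job_id}\n"}]}
        yield {"job": {"status": next(self._statuses)}}

async def tail(client, job_id):
    retry_count, latest_status = 0, None
    processed = set()
    while True:
        async for message in client.tail_job_logs(job_id):
            if "logs" in message:
                for log in message["logs"]:
                    print(log["message"], end="")
            elif "job" in message:
                latest_status = message["job"]["status"]
                if latest_status not in processed:  # announce each status once
                    processed.add(latest_status)
                    print(f">>>> Job is now in {latest_status} status.")
        if retry_count > RETRY_MAX_TIMES or latest_status in FINISHED:
            break  # stop on a terminal status or after too many retries
        await asyncio.sleep(RETRY_SLEEP_SEC)
        retry_count += 1
    return latest_status

print(asyncio.run(tail(FakeClient(), "job-123")))
```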
datachain-0.26.4.dist-info/METADATA → datachain-0.28.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datachain
- Version: 0.26.4
+ Version: 0.28.0
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License-Expression: Apache-2.0
@@ -45,7 +45,7 @@ Requires-Dist: datamodel-code-generator>=0.25
  Requires-Dist: Pillow<12,>=10.0.0
  Requires-Dist: msgpack<2,>=1.0.4
  Requires-Dist: psutil
- Requires-Dist: huggingface_hub
+ Requires-Dist: huggingface_hub<0.34.0
  Requires-Dist: iterative-telemetry>=0.0.10
  Requires-Dist: platformdirs
  Requires-Dist: dvc-studio-client<1,>=0.21
@@ -120,7 +120,7 @@ Dynamic: license-file
  |logo| DataChain
  ================

- |PyPI| |Python Version| |Codecov| |Tests|
+ |PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|

  .. |logo| image:: docs/assets/datachain.svg
  :height: 24
@@ -136,6 +136,9 @@ Dynamic: license-file
  .. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
  :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
  :alt: Tests
+ .. |DeepWiki| image:: https://deepwiki.com/badge.svg
+ :target: https://deepwiki.com/iterative/datachain
+ :alt: DeepWiki

  DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
  data like images, audio, videos, text and PDFs. It integrates with external storage
datachain-0.26.4.dist-info/RECORD → datachain-0.28.0.dist-info/RECORD CHANGED
@@ -17,7 +17,7 @@ datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
  datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
- datachain/studio.py,sha256=w5RyntqSl6qOs2mbw4Dc7SpZNNEN97xpvjxfJL0rO7M,14850
+ datachain/studio.py,sha256=-BmKLVNBLPFveUgVVE2So3aaiGndO2jK2qbHZ0zBDd8,15239
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
  datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
@@ -35,7 +35,7 @@ datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibV
  datachain/cli/commands/query.py,sha256=Xzfgh14nPVH-sclqX1tpZqgfdTugw5s_44v0D33z6FA,1505
  datachain/cli/commands/show.py,sha256=Cf8wBs12h-xtdOzjU5GTDy2C8rF5HJSF0hDJYER1zH8,1606
  datachain/cli/parser/__init__.py,sha256=NPB6ssP4CCt7G1SWZ_8oNQEH2C1lktWgkyHYXDQJZNc,15073
- datachain/cli/parser/job.py,sha256=2_g46bx_p7DnqZoYsXY2rHlB07BjBCuRPzpGP-Duk-s,5804
+ datachain/cli/parser/job.py,sha256=g6ozI3pnV0ly79L7M9mikCeYTPgKlG5gR0D144R82tk,5928
  datachain/cli/parser/studio.py,sha256=Bo__LKM7qhJGgkyX8M_bCvgZ2Gvqq6r_X4t1NdtaBIY,3881
  datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
  datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -48,7 +48,7 @@ datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,460
  datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
  datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
  datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
- datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw,402
+ datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
  datachain/data_storage/metastore.py,sha256=Qw332arvhgXB4UY0yX-Hu8Vgl3smU12l6bvxrL9Q-vo,53810
  datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
@@ -70,13 +70,13 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
  datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
  datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/lib/arrow.py,sha256=gMgmiMOhTGFMSyWBbjyzF2RsSXjx0XmUGPoSBxcWwe0,10756
- datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
+ datachain/lib/arrow.py,sha256=geoLvyDd5uMqS3D9Ec1ODlShCUAdtwHUwl8FqbUX_hg,10776
+ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
- datachain/lib/data_model.py,sha256=JPHPO6z-pehyiY-qNBAnp8u015xUHrijPKbGkMHS6lo,3493
+ datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
- datachain/lib/file.py,sha256=vlSFsmj0ltvQWG6_isfWwNZt5u002bwrl70J2KbdvDE,41335
- datachain/lib/hf.py,sha256=dadHs2dsi4ALwXz92Y3T7AUgq3wQF4mBydWqHCMjvks,6880
+ datachain/lib/file.py,sha256=_ch7xYcpl0kzImgEwccbQ-a5qb9rbEvx1vcuWerOn9k,42608
+ datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -92,7 +92,7 @@ datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
  datachain/lib/udf.py,sha256=SUnJWRDC3TlLhvpi8iqqJbeZGn5DChot7DyH-0Q-z20,17305
  datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
  datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
- datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
+ datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
  datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7214
  datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -106,7 +106,7 @@ datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
  datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
  datachain/lib/dc/datachain.py,sha256=mLE5v4KhzEQm7HVWBTxY6EwJ2J-YeFVcLUY4I21216c,93212
  datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
- datachain/lib/dc/hf.py,sha256=MJWO-NL4jAD6CEAmXsyeqXEyvefRLMhyxhT9jKT5vMU,2324
+ datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
  datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
  datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.26.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.26.4.dist-info/METADATA,sha256=oWaaj_Avr95dDdM_txeheiOefsoHuXTu0QR71hTN634,13624
- datachain-0.26.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- datachain-0.26.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.26.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.26.4.dist-info/RECORD,,
+ datachain-0.28.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.28.0.dist-info/METADATA,sha256=lA3lv9RX2NeQPobrEjoEbAwg5K3zmnAnbDJ_hjR8KLw,13766
+ datachain-0.28.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datachain-0.28.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.28.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.28.0.dist-info/RECORD,,