datachain 0.26.3__py3-none-any.whl → 0.26.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. (The source page links to more details here; the link target is not preserved in this text extract.)

datachain/cli/parser/job.py CHANGED
@@ -17,7 +17,12 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
17
17
  )
18
18
 
19
19
  studio_run_help = "Run a job in Studio"
20
- studio_run_description = "Run a job in Studio."
20
+ studio_run_description = "Run a job in Studio. \n"
21
+ studio_run_description += (
22
+ "When using --start-time or --cron,"
23
+ " the job is scheduled as a task and will not show logs immediately."
24
+ " The job will be executed according to the schedule."
25
+ )
21
26
 
22
27
  studio_run_parser = jobs_subparser.add_parser(
23
28
  "run",
@@ -96,6 +101,14 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
96
101
  help="Priority for the job in range 0-5. "
97
102
  "Lower value is higher priority (default: 5)",
98
103
  )
104
+ studio_run_parser.add_argument(
105
+ "--start-time",
106
+ action="store",
107
+ help="Start time in ISO format or natural language for the cron task.",
108
+ )
109
+ studio_run_parser.add_argument(
110
+ "--cron", action="store", help="Cron expression for the cron task."
111
+ )
99
112
 
100
113
  studio_ls_help = "List jobs in Studio"
101
114
  studio_ls_description = "List jobs in Studio."
datachain/lib/dc/datachain.py CHANGED
@@ -2388,7 +2388,7 @@ class DataChain:
2388
2388
  placement: FileExportPlacement = "fullpath",
2389
2389
  link_type: Literal["copy", "symlink"] = "copy",
2390
2390
  num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
2391
- anon: bool = False,
2391
+ anon: Optional[bool] = None,
2392
2392
  client_config: Optional[dict] = None,
2393
2393
  ) -> None:
2394
2394
  """Export files from a specified signal to a directory. Files can be
@@ -2403,7 +2403,11 @@ class DataChain:
2403
2403
  Falls back to `'copy'` if symlinking fails.
2404
2404
  num_threads : number of threads to use for exporting files.
2405
2405
  By default it uses 5 threads.
2406
- anon: If true, we will treat cloud bucket as public one
2406
+ anon: If True, we will treat cloud bucket as public one. Default behavior
2407
+ depends on the previous session configuration (e.g. happens in the
2408
+ initial `read_storage`) and particular cloud storage client
2409
+ implementation (e.g. S3 fallbacks to anonymous access if no credentials
2410
+ were found).
2407
2411
  client_config: Optional configuration for the destination storage client
2408
2412
 
2409
2413
  Example:
@@ -2421,8 +2425,8 @@ class DataChain:
2421
2425
  ):
2422
2426
  raise ValueError("Files with the same name found")
2423
2427
 
2424
- if anon:
2425
- client_config = (client_config or {}) | {"anon": True}
2428
+ if anon is not None:
2429
+ client_config = (client_config or {}) | {"anon": anon}
2426
2430
 
2427
2431
  progress_bar = tqdm(
2428
2432
  desc=f"Exporting files to {output}: ",
datachain/lib/dc/storage.py CHANGED
@@ -33,7 +33,7 @@ def read_storage(
33
33
  recursive: Optional[bool] = True,
34
34
  column: str = "file",
35
35
  update: bool = False,
36
- anon: bool = False,
36
+ anon: Optional[bool] = None,
37
37
  delta: Optional[bool] = False,
38
38
  delta_on: Optional[Union[str, Sequence[str]]] = (
39
39
  "file.path",
@@ -124,8 +124,8 @@ def read_storage(
124
124
 
125
125
  file_type = get_file_type(type)
126
126
 
127
- if anon:
128
- client_config = (client_config or {}) | {"anon": True}
127
+ if anon is not None:
128
+ client_config = (client_config or {}) | {"anon": anon}
129
129
  session = Session.get(session, client_config=client_config, in_memory=in_memory)
130
130
  catalog = session.catalog
131
131
  cache = catalog.cache
datachain/lib/file.py CHANGED
@@ -717,6 +717,23 @@ class ImageFile(File):
717
717
  destination = stringify_path(destination)
718
718
 
719
719
  client: Client = self._catalog.get_client(destination, **(client_config or {}))
720
+
721
+ # If format is not provided, determine it from the file extension
722
+ if format is None:
723
+ from pathlib import PurePosixPath
724
+
725
+ from PIL import Image as PilImage
726
+
727
+ ext = PurePosixPath(destination).suffix.lower()
728
+ format = PilImage.registered_extensions().get(ext)
729
+
730
+ if not format:
731
+ raise FileError(
732
+ f"Can't determine format for destination '{destination}'",
733
+ self.source,
734
+ self.path,
735
+ )
736
+
720
737
  with client.fs.open(destination, mode="wb") as f:
721
738
  self.read().save(f, format=format)
722
739
 
datachain/remote/studio.py CHANGED
@@ -429,6 +429,8 @@ class StudioClient:
429
429
  repository: Optional[str] = None,
430
430
  priority: Optional[int] = None,
431
431
  cluster: Optional[str] = None,
432
+ start_time: Optional[str] = None,
433
+ cron: Optional[str] = None,
432
434
  ) -> Response[JobData]:
433
435
  data = {
434
436
  "query": query,
@@ -442,6 +444,8 @@ class StudioClient:
442
444
  "repository": repository,
443
445
  "priority": priority,
444
446
  "compute_cluster_name": cluster,
447
+ "start_after": start_time,
448
+ "cron_expression": cron,
445
449
  }
446
450
  return self._send_request("datachain/job", data)
447
451
 
datachain/studio.py CHANGED
@@ -1,8 +1,10 @@
1
1
  import asyncio
2
2
  import os
3
3
  import sys
4
+ from datetime import datetime, timezone
4
5
  from typing import TYPE_CHECKING, Optional
5
6
 
7
+ import dateparser
6
8
  import tabulate
7
9
 
8
10
  from datachain.config import Config, ConfigLevel
@@ -42,6 +44,8 @@ def process_jobs_args(args: "Namespace"):
42
44
  args.req_file,
43
45
  args.priority,
44
46
  args.cluster,
47
+ args.start_time,
48
+ args.cron,
45
49
  )
46
50
 
47
51
  if args.cmd == "cancel":
@@ -262,6 +266,31 @@ def save_config(hostname, token, level=ConfigLevel.GLOBAL):
262
266
  return config.config_file()
263
267
 
264
268
 
269
+ def parse_start_time(start_time_str: Optional[str]) -> Optional[str]:
270
+ if not start_time_str:
271
+ return None
272
+
273
+ try:
274
+ # Parse the datetime string using dateparser
275
+ parsed_datetime = dateparser.parse(start_time_str)
276
+
277
+ if parsed_datetime is None:
278
+ raise DataChainError(
279
+ f"Could not parse datetime string: '{start_time_str}'. "
280
+ f"Supported formats include: '2024-01-15 14:30:00', 'tomorrow 3pm', "
281
+ f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc."
282
+ )
283
+
284
+ # Convert to ISO format string
285
+ return parsed_datetime.isoformat()
286
+ except Exception as e:
287
+ raise DataChainError(
288
+ f"Invalid datetime format for start_time: '{start_time_str}'. "
289
+ f"Supported formats include: '2024-01-15 14:30:00', 'tomorrow 3pm', "
290
+ f"'monday 9am', '2024-01-15T14:30:00Z', 'in 2 hours', etc. Error: {e}"
291
+ ) from e
292
+
293
+
265
294
  def show_logs_from_client(client, job_id):
266
295
  # Sync usage
267
296
  async def _run():
@@ -310,6 +339,8 @@ def create_job(
310
339
  req_file: Optional[str] = None,
311
340
  priority: Optional[int] = None,
312
341
  cluster: Optional[str] = None,
342
+ start_time: Optional[str] = None,
343
+ cron: Optional[str] = None,
313
344
  ):
314
345
  query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
315
346
  with open(query_file) as f:
@@ -328,6 +359,11 @@ def create_job(
328
359
  client = StudioClient(team=team_name)
329
360
  file_ids = upload_files(client, files) if files else []
330
361
 
362
+ # Parse start_time if provided
363
+ parsed_start_time = parse_start_time(start_time)
364
+ if cron and parsed_start_time is None:
365
+ parsed_start_time = datetime.now(timezone.utc).isoformat()
366
+
331
367
  response = client.create_job(
332
368
  query=query,
333
369
  query_type=query_type,
@@ -340,6 +376,8 @@ def create_job(
340
376
  requirements=requirements,
341
377
  priority=priority,
342
378
  cluster=cluster,
379
+ start_time=parsed_start_time,
380
+ cron=cron,
343
381
  )
344
382
  if not response.ok:
345
383
  raise DataChainError(response.message)
@@ -348,6 +386,11 @@ def create_job(
348
386
  raise DataChainError("Failed to create job")
349
387
 
350
388
  job_id = response.data.get("job", {}).get("id")
389
+
390
+ if parsed_start_time or cron:
391
+ print(f"Job {job_id} is scheduled as a task in Studio.")
392
+ return 0
393
+
351
394
  print(f"Job {job_id} created")
352
395
  print("Open the job in Studio at", response.data.get("job", {}).get("url"))
353
396
  print("=" * 40)
datachain-0.26.3.dist-info/METADATA → datachain-0.26.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.3
3
+ Version: 0.26.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -26,6 +26,7 @@ Requires-Dist: packaging
26
26
  Requires-Dist: pyarrow
27
27
  Requires-Dist: typing-extensions
28
28
  Requires-Dist: python-dateutil>=2
29
+ Requires-Dist: dateparser>=1.0.0
29
30
  Requires-Dist: attrs>=21.3.0
30
31
  Requires-Dist: fsspec>=2024.2.0
31
32
  Requires-Dist: s3fs>=2024.2.0
@@ -100,6 +101,7 @@ Provides-Extra: dev
100
101
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
102
  Requires-Dist: mypy==1.17.0; extra == "dev"
102
103
  Requires-Dist: types-python-dateutil; extra == "dev"
104
+ Requires-Dist: types-dateparser; extra == "dev"
103
105
  Requires-Dist: types-pytz; extra == "dev"
104
106
  Requires-Dist: types-PyYAML; extra == "dev"
105
107
  Requires-Dist: types-requests; extra == "dev"
datachain-0.26.3.dist-info/RECORD → datachain-0.26.4.dist-info/RECORD CHANGED
@@ -17,7 +17,7 @@ datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
17
17
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
19
19
  datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
20
- datachain/studio.py,sha256=bLok-eJNFRHQScEyAyA_Fas52dmijd5r-73KudWxV4k,13337
20
+ datachain/studio.py,sha256=w5RyntqSl6qOs2mbw4Dc7SpZNNEN97xpvjxfJL0rO7M,14850
21
21
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
22
22
  datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
23
23
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
@@ -35,7 +35,7 @@ datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibV
35
35
  datachain/cli/commands/query.py,sha256=Xzfgh14nPVH-sclqX1tpZqgfdTugw5s_44v0D33z6FA,1505
36
36
  datachain/cli/commands/show.py,sha256=Cf8wBs12h-xtdOzjU5GTDy2C8rF5HJSF0hDJYER1zH8,1606
37
37
  datachain/cli/parser/__init__.py,sha256=NPB6ssP4CCt7G1SWZ_8oNQEH2C1lktWgkyHYXDQJZNc,15073
38
- datachain/cli/parser/job.py,sha256=_wqOOxGRXG_-xuQ35FaLUOwjw6w8HviWvoEpZZ7VBzI,5289
38
+ datachain/cli/parser/job.py,sha256=2_g46bx_p7DnqZoYsXY2rHlB07BjBCuRPzpGP-Duk-s,5804
39
39
  datachain/cli/parser/studio.py,sha256=Bo__LKM7qhJGgkyX8M_bCvgZ2Gvqq6r_X4t1NdtaBIY,3881
40
40
  datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
41
41
  datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -75,7 +75,7 @@ datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
75
75
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
76
76
  datachain/lib/data_model.py,sha256=JPHPO6z-pehyiY-qNBAnp8u015xUHrijPKbGkMHS6lo,3493
77
77
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
78
- datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
78
+ datachain/lib/file.py,sha256=vlSFsmj0ltvQWG6_isfWwNZt5u002bwrl70J2KbdvDE,41335
79
79
  datachain/lib/hf.py,sha256=dadHs2dsi4ALwXz92Y3T7AUgq3wQF4mBydWqHCMjvks,6880
80
80
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
81
81
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
@@ -104,7 +104,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
104
104
  datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
105
105
  datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
106
106
  datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
107
- datachain/lib/dc/datachain.py,sha256=ap54lcuj71tvp0zX1jiFFiEWvA5UPeyYJRJkd2APmlI,92897
107
+ datachain/lib/dc/datachain.py,sha256=mLE5v4KhzEQm7HVWBTxY6EwJ2J-YeFVcLUY4I21216c,93212
108
108
  datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
109
109
  datachain/lib/dc/hf.py,sha256=MJWO-NL4jAD6CEAmXsyeqXEyvefRLMhyxhT9jKT5vMU,2324
110
110
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
@@ -112,7 +112,7 @@ datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,
112
112
  datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
113
113
  datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
114
114
  datachain/lib/dc/records.py,sha256=FpPbApWopUri1gIaSMsfXN4fevja4mjmfb6Q5eiaGxI,3116
115
- datachain/lib/dc/storage.py,sha256=8xiV3c6k-sG14RGwNJCp0AbV6L0mNDsTVZ-Est-ccnw,7672
115
+ datachain/lib/dc/storage.py,sha256=FXroEdxOZfbuEBIWfWTkbGwrI0D4_mrLZSRsIQm0WFE,7693
116
116
  datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
117
117
  datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
118
118
  datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -136,7 +136,7 @@ datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,68
136
136
  datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
137
137
  datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
138
138
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
- datachain/remote/studio.py,sha256=oJp2KD9eO8zQDnPfNpAALZYsOlBfqVKKRTeCkEpcsYk,15196
139
+ datachain/remote/studio.py,sha256=vsuqCAO65PBJKGLMxOvc3Bmieo2TJwcfc9YclxkzmFk,15350
140
140
  datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
141
141
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
142
142
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
158
158
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
159
159
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
160
160
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
161
- datachain-0.26.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
162
- datachain-0.26.3.dist-info/METADATA,sha256=HdG_quEq0rfrdKJJ_teSViVCXbXI3SxLlnh6tu2Mgfs,13543
163
- datachain-0.26.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- datachain-0.26.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
165
- datachain-0.26.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
166
- datachain-0.26.3.dist-info/RECORD,,
161
+ datachain-0.26.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
162
+ datachain-0.26.4.dist-info/METADATA,sha256=oWaaj_Avr95dDdM_txeheiOefsoHuXTu0QR71hTN634,13624
163
+ datachain-0.26.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ datachain-0.26.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
165
+ datachain-0.26.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
166
+ datachain-0.26.4.dist-info/RECORD,,