datachain 0.30.7__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.


datachain/remote/studio.py CHANGED
@@ -1,4 +1,3 @@
-import base64
 import json
 import logging
 import os
@@ -7,6 +6,7 @@ from datetime import datetime, timedelta, timezone
 from struct import unpack
 from typing import (
     Any,
+    BinaryIO,
     Generic,
     Optional,
     TypeVar,
@@ -30,8 +30,9 @@ DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
 FileUploadData = Optional[dict[str, Any]]
 JobData = Optional[dict[str, Any]]
-JobListData = dict[str, Any]
-ClusterListData = dict[str, Any]
+JobListData = list[dict[str, Any]]
+ClusterListData = list[dict[str, Any]]
+
 logger = logging.getLogger("datachain")

 DATASET_ROWS_CHUNK_SIZE = 8192
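With this change, JobListData and ClusterListData describe the list payload itself rather than a wrapping dict, so callers read response.data directly instead of unwrapping a "jobs" or "clusters" key (see the matching consumer changes in datachain/studio.py below). A minimal, self-contained sketch of that consumer-side pattern, using made-up sample data rather than a real API response:

from typing import Any

JobListData = list[dict[str, Any]]

# Illustrative payload only; a real value would come from StudioClient.get_jobs().
data: JobListData = [
    {"id": "abc123", "status": "RUNNING"},
    {"id": "def456", "status": "FAILED"},
]

jobs = data or []  # mirrors the `response.data or []` guard used in studio.py
for job in jobs:
    print(job.get("id"), job.get("status"))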
@@ -239,6 +240,45 @@ class StudioClient:

         return Response(data, ok, message, response.status_code)

+    def _send_multipart_request(
+        self, route: str, files: dict[str, Any], params: Optional[dict[str, Any]] = None
+    ) -> Response[Any]:
+        """
+        Function that communicates with Studio API using multipart/form-data.
+        It will raise an exception, and try to retry, if 5xx status code is
+        returned, or if Timeout exceptions is thrown from the requests lib
+        """
+        import requests
+
+        # Add team_name to params
+        request_params = {**(params or {}), "team_name": self.team}
+
+        response = requests.post(
+            url=f"{self.url}/{route}",
+            files=files,
+            params=request_params,
+            headers={
+                "Authorization": f"token {self.token}",
+            },
+            timeout=self.timeout,
+        )
+
+        ok = response.ok
+        try:
+            data = json.loads(response.content.decode("utf-8"))
+        except json.decoder.JSONDecodeError:
+            data = {}
+
+        if not ok:
+            if response.status_code == 403:
+                message = f"Not authorized for the team {self.team}"
+            else:
+                message = data.get("message", "")
+        else:
+            message = ""
+
+        return Response(data, ok, message, response.status_code)
+
     @staticmethod
     def _unpacker_hook(code, data):
         import msgpack
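The new helper sends uploads as multipart/form-data: the file travels in the request body as a named form part, the team name goes in the query string, and the token in an Authorization header. Below is a standalone sketch of the request it builds, using requests.Request(...).prepare() so nothing is sent over the network; the URL, team name, and token are placeholders, not real values:

import io

import requests

files = {"file": ("hello.txt", io.BytesIO(b"hello"), "application/octet-stream")}
request = requests.Request(
    "POST",
    "https://studio.example.com/api/datachain/jobs/files",  # placeholder URL
    files=files,
    params={"team_name": "my-team"},                        # placeholder team
    headers={"Authorization": "token <redacted>"},          # placeholder token
)
prepared = request.prepare()
print(prepared.url)                      # team_name appended as a query parameter
print(prepared.headers["Content-Type"])  # multipart/form-data; boundary=...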
@@ -409,12 +449,13 @@ class StudioClient:
             method="GET",
         )

-    def upload_file(self, content: bytes, file_name: str) -> Response[FileUploadData]:
-        data = {
-            "file_content": base64.b64encode(content).decode("utf-8"),
-            "file_name": file_name,
-        }
-        return self._send_request("datachain/upload-file", data)
+    def upload_file(
+        self, file_obj: BinaryIO, file_name: str
+    ) -> Response[FileUploadData]:
+        # Prepare multipart form data
+        files = {"file": (file_name, file_obj, "application/octet-stream")}
+
+        return self._send_multipart_request("datachain/jobs/files", files)

     def create_job(
         self,
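For callers, upload_file now takes an open binary file object (BinaryIO) instead of a bytes payload, so the content is streamed as a multipart part rather than base64-encoded into a JSON body, and the uploaded file's id is read from the top level of the response. A hedged caller-side sketch, assuming `client` is an already-configured StudioClient; the helper itself is illustrative and not part of the package:

import os
from typing import Optional

from datachain.remote.studio import StudioClient


def upload_one(client: StudioClient, path: str) -> Optional[str]:
    # Keep the handle open for the duration of the multipart request.
    with open(path, "rb") as f:
        response = client.upload_file(f, os.path.basename(path))
    if not response.ok or not response.data:
        return None
    return str(response.data.get("id"))  # flat payload: id at the top level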
@@ -449,25 +490,27 @@ class StudioClient:
             "cron_expression": cron,
             "credentials_name": credentials_name,
         }
-        return self._send_request("datachain/job", data)
+        return self._send_request("datachain/jobs/", data)

     def get_jobs(
         self,
         status: Optional[str] = None,
         limit: int = 20,
+        job_id: Optional[str] = None,
     ) -> Response[JobListData]:
-        return self._send_request(
-            "datachain/jobs",
-            {"status": status, "limit": limit} if status else {"limit": limit},
-            method="GET",
-        )
+        params: dict[str, Any] = {"limit": limit}
+        if status is not None:
+            params["status"] = status
+        if job_id is not None:
+            params["job_id"] = job_id
+        return self._send_request("datachain/jobs/", params, method="GET")

     def cancel_job(
         self,
         job_id: str,
     ) -> Response[JobData]:
-        url = f"datachain/job/{job_id}/cancel"
+        url = f"datachain/jobs/{job_id}/cancel"
         return self._send_request(url, data={}, method="POST")

     def get_clusters(self) -> Response[ClusterListData]:
-        return self._send_request("datachain/clusters", {}, method="GET")
+        return self._send_request("datachain/clusters/", {}, method="GET")
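The job and cluster routes gain a trailing slash (datachain/jobs/, datachain/clusters/), and get_jobs accepts an optional job_id filter alongside status and limit, sending only the filters that were actually provided as query parameters. A hedged caller-side sketch, again assuming an already-configured StudioClient; the helper is illustrative only:

from typing import Any, Optional

from datachain.remote.studio import StudioClient


def find_job(client: StudioClient, job_id: str) -> Optional[dict[str, Any]]:
    response = client.get_jobs(job_id=job_id, limit=1)
    if not response.ok or not response.data:
        return None
    return response.data[0]  # JobListData is now a plain list of job dicts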
datachain/studio.py CHANGED
@@ -403,14 +403,14 @@ def create_job(
     if not response.data:
         raise DataChainError("Failed to create job")

-    job_id = response.data.get("job", {}).get("id")
+    job_id = response.data.get("id")

     if parsed_start_time or cron:
         print(f"Job {job_id} is scheduled as a task in Studio.")
         return 0

     print(f"Job {job_id} created")
-    print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+    print("Open the job in Studio at", response.data.get("url"))
     print("=" * 40)

     return 0 if no_wait else show_logs_from_client(client, job_id)
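The create-job response is now read as a flat object: the id and url live at the top level rather than under a nested "job" key. Purely illustrative payloads (not captured from the API; field values are placeholders) showing the shapes the old and new code expect:

# Hypothetical example payloads, for illustration only.
old_payload = {"job": {"id": "abc123", "url": "https://studio.example.com/jobs/abc123"}}
new_payload = {"id": "abc123", "url": "https://studio.example.com/jobs/abc123"}

assert old_payload.get("job", {}).get("id") == new_payload.get("id")
print("Open the job in Studio at", new_payload.get("url"))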
@@ -421,16 +421,14 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
     for file in files:
         file_name = os.path.basename(file)
         with open(file, "rb") as f:
-            file_content = f.read()
-            response = client.upload_file(file_content, file_name)
+            response = client.upload_file(f, file_name)
         if not response.ok:
             raise DataChainError(response.message)

         if not response.data:
             raise DataChainError(f"Failed to upload file {file_name}")

-        file_id = response.data.get("blob", {}).get("id")
-        if file_id:
+        if file_id := response.data.get("id"):
             file_ids.append(str(file_id))
     return file_ids

@@ -456,7 +454,7 @@ def list_jobs(status: Optional[str], team_name: Optional[str], limit: int):
     if not response.ok:
         raise DataChainError(response.message)

-    jobs = response.data.get("jobs", [])
+    jobs = response.data or []
     if not jobs:
         print("No jobs found")
         return
@@ -492,7 +490,7 @@ def list_clusters(team_name: Optional[str]):
     if not response.ok:
         raise DataChainError(response.message)

-    clusters = response.data.get("clusters", [])
+    clusters = response.data or []
     if not clusters:
         print("No clusters found")
         return
505
503
  "Cloud Provider": cluster.get("cloud_provider"),
506
504
  "Cloud Credentials": cluster.get("cloud_credentials"),
507
505
  "Is Active": cluster.get("is_active"),
506
+ "Is Default": cluster.get("default"),
508
507
  "Max Workers": cluster.get("max_workers"),
509
508
  }
510
509
  for cluster in clusters
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.30.7
+Version: 0.31.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -17,7 +17,7 @@ datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
-datachain/studio.py,sha256=27750qCSNxIChEzhV02damIFreLMfr7UdiWqMFyk8AA,15361
+datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
@@ -136,7 +136,7 @@ datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,68
 datachain/query/udf.py,sha256=jqutTpvkT6eHl96ZEgYiiTMAhI7vmTQA6JH9y4WCibI,1405
 datachain/query/utils.py,sha256=a2PTBZ3qsG6XlUcp9XsoGiQfKkca4Q3m-VzFgiGQPAc,1230
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=pDThxvEEpIKVGfa9rmtz_zeqHwrgzh0Lv-Pd4wzDx5k,15448
+datachain/remote/studio.py,sha256=amjcV0B8qumsVBnxPQnt8oSrnfMK2vAdOurVMA9L_zA,16868
 datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
 datachain/sql/postgresql_dialect.py,sha256=pDTfH8xaXz5xZsq8O1aQUvWLRIv_ogYeAqtmKlPp3Rw,280
 datachain/sql/postgresql_types.py,sha256=ryb_0lzuA9UOJ_B6nW9Yb8nJjzeSmEItAL_Ceue65lc,627
@@ -160,9 +160,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.30.7.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.30.7.dist-info/METADATA,sha256=d6ClkSVhY7AFkjh7jgUFEwHpTa7LhpJU75_M8ufegcI,13898
-datachain-0.30.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.30.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.30.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.30.7.dist-info/RECORD,,
+datachain-0.31.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.31.0.dist-info/METADATA,sha256=hY_KVFdUHZmZcxRiy5e-GY6CXI-sY0oKAtrvNakApdY,13898
+datachain-0.31.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.31.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.31.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.31.0.dist-info/RECORD,,