datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datachain might be problematic.
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/catalog.py +75 -83
- datachain/catalog/loader.py +3 -3
- datachain/checkpoint.py +1 -2
- datachain/cli/__init__.py +2 -4
- datachain/cli/commands/datasets.py +13 -13
- datachain/cli/commands/ls.py +4 -4
- datachain/cli/commands/query.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +1 -2
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +11 -21
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +4 -4
- datachain/client/local.py +4 -4
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +5 -5
- datachain/data_storage/metastore.py +107 -107
- datachain/data_storage/schema.py +18 -24
- datachain/data_storage/sqlite.py +21 -28
- datachain/data_storage/warehouse.py +13 -13
- datachain/dataset.py +64 -70
- datachain/delta.py +21 -18
- datachain/diff/__init__.py +13 -13
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +45 -42
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +54 -81
- datachain/job.py +8 -8
- datachain/lib/arrow.py +17 -14
- datachain/lib/audio.py +6 -6
- datachain/lib/clip.py +5 -4
- datachain/lib/convert/python_to_sql.py +4 -22
- datachain/lib/convert/values_to_tuples.py +4 -9
- datachain/lib/data_model.py +20 -19
- datachain/lib/dataset_info.py +6 -6
- datachain/lib/dc/csv.py +10 -10
- datachain/lib/dc/database.py +28 -29
- datachain/lib/dc/datachain.py +98 -97
- datachain/lib/dc/datasets.py +22 -22
- datachain/lib/dc/hf.py +4 -4
- datachain/lib/dc/json.py +9 -10
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +5 -5
- datachain/lib/dc/storage.py +12 -12
- datachain/lib/dc/storage_pattern.py +2 -2
- datachain/lib/dc/utils.py +11 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +32 -28
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +1 -2
- datachain/lib/model_store.py +3 -3
- datachain/lib/namespaces.py +4 -6
- datachain/lib/projects.py +5 -9
- datachain/lib/pytorch.py +10 -10
- datachain/lib/settings.py +23 -23
- datachain/lib/signal_schema.py +52 -44
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +25 -17
- datachain/lib/udf_signature.py +11 -11
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +30 -35
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +4 -4
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +1 -7
- datachain/project.py +4 -4
- datachain/query/batch.py +7 -8
- datachain/query/dataset.py +80 -87
- datachain/query/dispatch.py +7 -7
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/schema.py +7 -6
- datachain/query/session.py +7 -7
- datachain/query/udf.py +8 -7
- datachain/query/utils.py +3 -5
- datachain/remote/studio.py +33 -39
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +6 -9
- datachain/studio.py +30 -30
- datachain/toolkit/split.py +1 -2
- datachain/utils.py +21 -21
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
- datachain-0.35.0.dist-info/RECORD +173 -0
- datachain-0.34.6.dist-info/RECORD +0 -173
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
datachain/remote/studio.py
CHANGED
@@ -4,13 +4,7 @@ import os
 from collections.abc import AsyncIterator, Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
-from typing import (
-    Any,
-    BinaryIO,
-    Generic,
-    Optional,
-    TypeVar,
-)
+from typing import Any, BinaryIO, Generic, TypeVar
 from urllib.parse import urlparse, urlunparse

 import websockets
@@ -22,14 +16,14 @@ from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, retry_with_backoff

 T = TypeVar("T")
-LsData = Optional[list[dict[str, Any]]]
-DatasetInfoData = Optional[dict[str, Any]]
-DatasetRowsData = Optional[Iterable[dict[str, Any]]]
-DatasetJobVersionsData = Optional[dict[str, Any]]
-DatasetExportStatus = Optional[dict[str, Any]]
-DatasetExportSignedUrls = Optional[list[str]]
-FileUploadData = Optional[dict[str, Any]]
-JobData = Optional[dict[str, Any]]
+LsData = list[dict[str, Any]] | None
+DatasetInfoData = dict[str, Any] | None
+DatasetRowsData = Iterable[dict[str, Any]] | None
+DatasetJobVersionsData = dict[str, Any] | None
+DatasetExportStatus = dict[str, Any] | None
+DatasetExportSignedUrls = list[str] | None
+FileUploadData = dict[str, Any] | None
+JobData = dict[str, Any] | None
 JobListData = list[dict[str, Any]]
 ClusterListData = list[dict[str, Any]]

@@ -93,7 +87,7 @@ class Response(Generic[T]):


 class StudioClient:
-    def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
+    def __init__(self, timeout: float = 3600.0, team: str | None = None) -> None:
         self._check_dependencies()
         self.timeout = timeout
         self._config = None
@@ -154,7 +148,7 @@ class StudioClient:
         ) from None

     def _send_request_msgpack(
-        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+        self, route: str, data: dict[str, Any], method: str | None = "POST"
     ) -> Response[Any]:
         import msgpack
         import requests
@@ -192,7 +186,7 @@ class StudioClient:

     @retry_with_backoff(retries=3, errors=(HTTPError, Timeout))
     def _send_request(
-        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+        self, route: str, data: dict[str, Any], method: str | None = "POST"
     ) -> Response[Any]:
         """
         Function that communicate Studio API.
@@ -241,7 +235,7 @@ class StudioClient:
         return Response(data, ok, message, response.status_code)

     def _send_multipart_request(
-        self, route: str, files: dict[str, Any], params: Optional[dict[str, Any]] = None
+        self, route: str, files: dict[str, Any], params: dict[str, Any] | None = None
     ) -> Response[Any]:
         """
         Function that communicates with Studio API using multipart/form-data.
@@ -345,7 +339,7 @@ class StudioClient:
             response = self._send_request_msgpack("datachain/ls", {"source": path})
             yield path, response

-    def ls_datasets(self, prefix: Optional[str] = None) -> Response[LsData]:
+    def ls_datasets(self, prefix: str | None = None) -> Response[LsData]:
         return self._send_request(
             "datachain/datasets", {"prefix": prefix}, method="GET"
         )
@@ -355,9 +349,9 @@
         name: str,
         namespace: str,
         project: str,
-        new_name: Optional[str] = None,
-        description: Optional[str] = None,
-        attrs: Optional[list[str]] = None,
+        new_name: str | None = None,
+        description: str | None = None,
+        attrs: list[str] | None = None,
     ) -> Response[DatasetInfoData]:
         body = {
             "new_name": new_name,
@@ -378,8 +372,8 @@
         name: str,
         namespace: str,
         project: str,
-        version: Optional[str] = None,
-        force: Optional[bool] = False,
+        version: str | None = None,
+        force: bool | None = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
             "datachain/datasets",
@@ -461,18 +455,18 @@
         self,
         query: str,
         query_type: str,
-        environment: Optional[str] = None,
-        workers: Optional[int] = None,
-        query_name: Optional[str] = None,
-        files: Optional[list[str]] = None,
-        python_version: Optional[str] = None,
-        requirements: Optional[str] = None,
-        repository: Optional[str] = None,
-        priority: Optional[int] = None,
-        cluster: Optional[str] = None,
-        start_time: Optional[str] = None,
-        cron: Optional[str] = None,
-        credentials_name: Optional[str] = None,
+        environment: str | None = None,
+        workers: int | None = None,
+        query_name: str | None = None,
+        files: list[str] | None = None,
+        python_version: str | None = None,
+        requirements: str | None = None,
+        repository: str | None = None,
+        priority: int | None = None,
+        cluster: str | None = None,
+        start_time: str | None = None,
+        cron: str | None = None,
+        credentials_name: str | None = None,
     ) -> Response[JobData]:
         data = {
             "query": query,
@@ -494,9 +488,9 @@

     def get_jobs(
         self,
-        status: Optional[str] = None,
+        status: str | None = None,
         limit: int = 20,
-        job_id: Optional[str] = None,
+        job_id: str | None = None,
     ) -> Response[JobListData]:
         params: dict[str, Any] = {"limit": limit}
         if status is not None:
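Note: the hunks above follow the pattern repeated throughout this release: `Optional[X]` / `Union[X, Y]` annotations are replaced with PEP 604 unions (`X | None`, `X | Y`) and the corresponding `typing` imports are dropped. A minimal sketch of the two spellings for context (illustrative only, not datachain's actual API; the `get_job` stub and its names are hypothetical):

# Illustrative sketch, not datachain code: contrasts the pre-0.35.0 and
# 0.35.0 spellings of an optional-dict type alias and an optional parameter.
from typing import Any, Optional

LegacyJobData = Optional[dict[str, Any]]  # pre-0.35.0 spelling, importable on 3.9
JobData = dict[str, Any] | None           # PEP 604 spelling; in a plain assignment
                                          # like this it needs Python 3.10+

def get_job(job_id: str, team: str | None = None) -> JobData:
    # Hypothetical stub standing in for a StudioClient call.
    return {"id": job_id, "team": team}

print(get_job("abc123"))

Because module-level aliases such as `JobData = dict[str, Any] | None` are evaluated at import time, the new spelling requires Python 3.10+, which lines up with the Requires-Python bump in this release's METADATA diff below.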
datachain/script_meta.py
CHANGED
@@ -1,6 +1,6 @@
 import re
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any

 try:
     import tomllib
@@ -59,23 +59,23 @@ class ScriptConfig:

     """

-    python_version: Optional[str]
+    python_version: str | None
     dependencies: list[str]
     attachments: dict[str, str]
     params: dict[str, Any]
     inputs: dict[str, Any]
     outputs: dict[str, Any]
-    num_workers: Optional[int] = None
+    num_workers: int | None = None

     def __init__(
         self,
-        python_version: Optional[str] = None,
-        dependencies: Optional[list[str]] = None,
-        attachments: Optional[dict[str, str]] = None,
-        params: Optional[dict[str, Any]] = None,
-        inputs: Optional[dict[str, Any]] = None,
-        outputs: Optional[dict[str, Any]] = None,
-        num_workers: Optional[int] = None,
+        python_version: str | None = None,
+        dependencies: list[str] | None = None,
+        attachments: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+        inputs: dict[str, Any] | None = None,
+        outputs: dict[str, Any] | None = None,
+        num_workers: int | None = None,
     ):
         self.python_version = python_version
         self.dependencies = dependencies or []
@@ -98,7 +98,7 @@ class ScriptConfig:
         return self.attachments.get(name, default)

     @staticmethod
-    def read(script: str) -> Optional[dict]:
+    def read(script: str) -> dict | None:
         """Converts inline script metadata to dict with all found data"""
         regex = (
             r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
@@ -119,7 +119,7 @@ class ScriptConfig:
             return None

     @staticmethod
-    def parse(script: str) -> Optional["ScriptConfig"]:
+    def parse(script: str) -> "ScriptConfig | None":
         """
         Method that is parsing inline script metadata from datachain script and
         instantiating ScriptConfig class with found data. If no inline metadata is
datachain/sql/sqlite/base.py
CHANGED
@@ -2,11 +2,10 @@ import logging
 import re
 import sqlite3
 import warnings
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from datetime import MAXYEAR, MINYEAR, datetime, timezone
 from functools import cache
 from types import MappingProxyType
-from typing import Callable, Optional

 import sqlalchemy as sa
 import ujson as json
@@ -132,7 +131,7 @@ def run_compiler_hook(name):


 def functions_exist(
-    names: Iterable[str], connection: Optional[sqlite3.Connection] = None
+    names: Iterable[str], connection: sqlite3.Connection | None = None
 ) -> bool:
     """
     Returns True if all function names are defined for the given connection.
@@ -201,9 +200,7 @@ def sqlite_int_hash_64(x: int) -> int:
 def sqlite_bit_hamming_distance(a: int, b: int) -> int:
     """Calculate the Hamming distance between two integers."""
     diff = (a & MAX_INT64) ^ (b & MAX_INT64)
-
-    return diff.bit_count()
-    return bin(diff).count("1")
+    return diff.bit_count()


 def sqlite_byte_hamming_distance(a: str, b: str) -> int:
@@ -215,7 +212,7 @@ def sqlite_byte_hamming_distance(a: str, b: str) -> int:
     elif len(b) < len(a):
         diff = len(a) - len(b)
         a = a[: len(b)]
-    return diff + sum(c1 != c2 for c1, c2 in zip(a, b))
+    return diff + sum(c1 != c2 for c1, c2 in zip(a, b, strict=False))


 def register_user_defined_sql_functions() -> None:
@@ -470,7 +467,7 @@ def py_json_array_get_element(val, idx):
         return None


-def py_json_array_slice(val, offset: int, length: Optional[int] = None):
+def py_json_array_slice(val, offset: int, length: int | None = None):
     arr = json.loads(val)
     try:
         return json.dumps(
@@ -605,7 +602,7 @@ def compile_collect(element, compiler, **kwargs):


 @cache
-def usearch_sqlite_path() -> Optional[str]:
+def usearch_sqlite_path() -> str | None:
     try:
         import usearch
     except ImportError:
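For context on the `sqlite_bit_hamming_distance` hunk above: `int.bit_count()`, available since Python 3.10, computes the same population count as the older `bin(x).count("1")` expression that this release drops. A small standalone check (not datachain code; the `MAX_INT64` value below is an assumption about the module-level constant, used the same way as in the hunk):

# Standalone check, not datachain code.
MAX_INT64 = (1 << 63) - 1  # assumed 63-bit mask, matching the usage shown above

def hamming_old(a: int, b: int) -> int:
    diff = (a & MAX_INT64) ^ (b & MAX_INT64)
    return bin(diff).count("1")  # popcount via string counting, works on any 3.x

def hamming_new(a: int, b: int) -> int:
    diff = (a & MAX_INT64) ^ (b & MAX_INT64)
    return diff.bit_count()      # int.bit_count() exists since Python 3.10

for a, b in [(0, 0), (0b1010, 0b0110), (2**62, 1), (-5, 7)]:
    assert hamming_old(a, b) == hamming_new(a, b)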
datachain/studio.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import os
 import sys
 from datetime import datetime, timezone
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 import dateparser
 import tabulate
@@ -175,7 +175,7 @@ def token():
     print(token)


-def list_datasets(team: Optional[str] = None, name: Optional[str] = None):
+def list_datasets(team: str | None = None, name: str | None = None):
     def ds_full_name(ds: dict) -> str:
         return (
             f"{ds['project']['namespace']['name']}.{ds['project']['name']}.{ds['name']}"
@@ -206,7 +206,7 @@ def list_datasets(team: Optional[str] = None, name: Optional[str] = None):
         yield (full_name, version)


-def list_dataset_versions(team: Optional[str] = None, name: str = ""):
+def list_dataset_versions(team: str | None = None, name: str = ""):
     client = StudioClient(team=team)

     namespace_name, project_name, name = parse_dataset_name(name)
@@ -226,13 +226,13 @@ def list_dataset_versions(team: Optional[str] = None, name: str = ""):


 def edit_studio_dataset(
-    team_name: Optional[str],
+    team_name: str | None,
     name: str,
     namespace: str,
     project: str,
-    new_name: Optional[str] = None,
-    description: Optional[str] = None,
-    attrs: Optional[list[str]] = None,
+    new_name: str | None = None,
+    description: str | None = None,
+    attrs: list[str] | None = None,
 ):
     client = StudioClient(team=team_name)
     response = client.edit_dataset(
@@ -245,12 +245,12 @@ def edit_studio_dataset(


 def remove_studio_dataset(
-    team_name: Optional[str],
+    team_name: str | None,
     name: str,
     namespace: str,
     project: str,
-    version: Optional[str] = None,
-    force: Optional[bool] = False,
+    version: str | None = None,
+    force: bool | None = False,
 ):
     client = StudioClient(team=team_name)
     response = client.rm_dataset(name, namespace, project, version, force)
@@ -271,7 +271,7 @@ def save_config(hostname, token, level=ConfigLevel.GLOBAL):
     return config.config_file()


-def parse_start_time(start_time_str: Optional[str]) -> Optional[str]:
+def parse_start_time(start_time_str: str | None) -> str | None:
     if not start_time_str:
         return None

@@ -343,21 +343,21 @@ def show_logs_from_client(client, job_id):

 def create_job(
     query_file: str,
-    team_name: Optional[str],
-    env_file: Optional[str] = None,
-    env: Optional[list[str]] = None,
-    workers: Optional[int] = None,
-    files: Optional[list[str]] = None,
-    python_version: Optional[str] = None,
-    repository: Optional[str] = None,
-    req: Optional[list[str]] = None,
-    req_file: Optional[str] = None,
-    priority: Optional[int] = None,
-    cluster: Optional[str] = None,
-    start_time: Optional[str] = None,
-    cron: Optional[str] = None,
-    no_wait: Optional[bool] = False,
-    credentials_name: Optional[str] = None,
+    team_name: str | None,
+    env_file: str | None = None,
+    env: list[str] | None = None,
+    workers: int | None = None,
+    files: list[str] | None = None,
+    python_version: str | None = None,
+    repository: str | None = None,
+    req: list[str] | None = None,
+    req_file: str | None = None,
+    priority: int | None = None,
+    cluster: str | None = None,
+    start_time: str | None = None,
+    cron: str | None = None,
+    no_wait: bool | None = False,
+    credentials_name: str | None = None,
 ):
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:
@@ -433,7 +433,7 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
     return file_ids


-def cancel_job(job_id: str, team_name: Optional[str]):
+def cancel_job(job_id: str, team_name: str | None):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
@@ -448,7 +448,7 @@ def cancel_job(job_id: str, team_name: Optional[str]):
     print(f"Job {job_id} canceled")


-def list_jobs(status: Optional[str], team_name: Optional[str], limit: int):
+def list_jobs(status: str | None, team_name: str | None, limit: int):
     client = StudioClient(team=team_name)
     response = client.get_jobs(status, limit)
     if not response.ok:
@@ -473,7 +473,7 @@ def list_jobs(status: Optional[str], team_name: Optional[str], limit: int):
     print(tabulate.tabulate(rows, headers="keys", tablefmt="grid"))


-def show_job_logs(job_id: str, team_name: Optional[str]):
+def show_job_logs(job_id: str, team_name: str | None):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
@@ -484,7 +484,7 @@ def show_job_logs(job_id: str, team_name: Optional[str]):
     return show_logs_from_client(client, job_id)


-def list_clusters(team_name: Optional[str]):
+def list_clusters(team_name: str | None):
     client = StudioClient(team=team_name)
     response = client.get_clusters()
     if not response.ok:
datachain/toolkit/split.py
CHANGED
@@ -1,5 +1,4 @@
 import random
-from typing import Optional

 from datachain import C, DataChain

@@ -9,7 +8,7 @@ RESOLUTION = 2**31 - 1  # Maximum positive value for a 32-bit signed integer.
 def train_test_split(
     dc: DataChain,
     weights: list[float],
-    seed: Optional[int] = None,
+    seed: int | None = None,
 ) -> list[DataChain]:
     """
     Splits a DataChain into multiple subsets based on the provided weights.
datachain/utils.py
CHANGED
@@ -11,7 +11,7 @@ import time
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import contextmanager
 from datetime import date, datetime, timezone
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar
 from uuid import UUID

 import cloudpickle
@@ -53,11 +53,11 @@ class DataChainDir:

     def __init__(
         self,
-        root: Optional[str] = None,
-        cache: Optional[str] = None,
-        tmp: Optional[str] = None,
-        db: Optional[str] = None,
-        config: Optional[str] = None,
+        root: str | None = None,
+        cache: str | None = None,
+        tmp: str | None = None,
+        db: str | None = None,
+        config: str | None = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -122,7 +122,7 @@ def global_config_dir():
     )


-def human_time_to_int(time: str) -> Optional[int]:
+def human_time_to_int(time: str) -> int | None:
     if not time:
         return None

@@ -146,7 +146,7 @@ def time_to_str(dt):
     return dt.strftime("%Y-%m-%d %H:%M:%S")


-def time_to_local(dt: Union[datetime, str]) -> datetime:
+def time_to_local(dt: datetime | str) -> datetime:
     # TODO check usage
     if isinstance(dt, str):
         dt = isoparse(dt)
@@ -156,11 +156,11 @@ def time_to_local(dt: Union[datetime, str]) -> datetime:
     return dt


-def time_to_local_str(dt: Union[datetime, str]) -> str:
+def time_to_local_str(dt: datetime | str) -> str:
     return time_to_str(time_to_local(dt))


-def is_expired(expires: Optional[Union[datetime, str]]):
+def is_expired(expires: datetime | str | None):
     if expires:
         return time_to_local(expires) < time_to_local(datetime.now())  # noqa: DTZ005

@@ -301,9 +301,9 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):


 def determine_workers(
-    workers: Union[bool, int],
-    rows_total: Optional[int] = None,
-) -> Union[bool, int]:
+    workers: bool | int,
+    rows_total: int | None = None,
+) -> bool | int:
     """Determine the number of workers to use for distributed processing."""
     if rows_total is not None and rows_total <= 1:
         # Disable distributed processing if there is no rows or only one row.
@@ -322,9 +322,9 @@ def determine_workers(


 def determine_processes(
-    parallel: Optional[Union[bool, int]] = None,
-    rows_total: Optional[int] = None,
-) -> Union[bool, int]:
+    parallel: bool | int | None = None,
+    rows_total: int | None = None,
+) -> bool | int:
     """Determine the number of processes to use for parallel processing."""
     if rows_total is not None and rows_total <= 1:
         # Disable parallel processing if there is no rows or only one row.
@@ -344,8 +344,8 @@ def determine_processes(


 def get_env_list(
-    key: str, default: Optional[Sequence] = None, sep: str = ","
-) -> Optional[Sequence[str]]:
+    key: str, default: Sequence | None = None, sep: str = ","
+) -> Sequence[str] | None:
     try:
         str_val = os.environ[key]
     except KeyError:
@@ -386,10 +386,10 @@ def show_df(


 def show_records(
-    records: Optional[list[dict]],
+    records: list[dict] | None,
     collapse_columns: bool = False,
     system_columns: bool = False,
-    hidden_fields: Optional[list[str]] = None,
+    hidden_fields: list[str] | None = None,
 ) -> None:
     import pandas as pd

@@ -518,7 +518,7 @@ def row_to_nested_dict(
 ) -> dict[str, Any]:
     """Converts a row to a nested dict based on the provided headers."""
     result: dict[str, Any] = {}
-    for h, v in zip(headers, row):
+    for h, v in zip(headers, row, strict=False):
         nested_dict_path_set(result, h, v)
     return result

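The `zip(..., strict=False)` edits above (and the matching one in sql/sqlite/base.py) use the `strict` keyword that `zip()` gained in Python 3.10; `strict=False` keeps the historical truncate-to-shortest behaviour but states it explicitly, which is what linters such as ruff's B905 check ask for. A quick standalone illustration (not datachain code):

# Standalone illustration of zip()'s strict keyword (Python 3.10+).
headers = ["id", "name"]
row = (1, "cat", "extra")

print(list(zip(headers, row, strict=False)))  # [('id', 1), ('name', 'cat')]

try:
    list(zip(headers, row, strict=True))      # mismatched lengths raise
except ValueError as exc:
    print(f"strict=True raised: {exc}")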
{datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.34.6
+Version: 0.35.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -8,13 +8,12 @@ Project-URL: Documentation, https://datachain.dvc.ai
 Project-URL: Issues, https://github.com/iterative/datachain/issues
 Project-URL: Source, https://github.com/iterative/datachain
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: pyyaml