datachain-0.6.1-py3-none-any.whl → datachain-0.6.2-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- datachain/catalog/catalog.py +61 -219
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +4 -4
- datachain/data_storage/warehouse.py +101 -125
- datachain/lib/dc.py +211 -52
- datachain/lib/func/__init__.py +20 -2
- datachain/lib/func/aggregate.py +319 -8
- datachain/lib/func/func.py +97 -9
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +8 -5
- datachain/lib/udf.py +3 -3
- datachain/listing.py +22 -48
- datachain/query/dataset.py +11 -3
- datachain/remote/studio.py +63 -14
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/RECORD +27 -26
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/listing.py
CHANGED

@@ -4,12 +4,10 @@ from collections.abc import Iterable, Iterator
 from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
-from fsspec.asyn import get_loop, sync
 from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm
 
-from datachain.lib.file import File
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
@@ -17,33 +15,29 @@ from datachain.utils import suffix_to_number
 if TYPE_CHECKING:
     from datachain.catalog.datasource import DataSource
     from datachain.client import Client
-    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
+    from datachain.data_storage import AbstractWarehouse
     from datachain.dataset import DatasetRecord
-    from datachain.storage import Storage
 
 
 class Listing:
     def __init__(
         self,
-        storage: Optional["Storage"],
-        metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         client: "Client",
         dataset: Optional["DatasetRecord"],
+        object_name: str = "file",
     ):
-        self.storage = storage
-        self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
         self.dataset = dataset  # dataset representing bucket listing
+        self.object_name = object_name
 
     def clone(self) -> "Listing":
         return self.__class__(
-            self.storage,
-            self.metastore.clone(),
             self.warehouse.clone(),
             self.client,
             self.dataset,
+            self.object_name,
         )
 
     def __enter__(self) -> "Listing":
@@ -53,46 +47,20 @@ class Listing:
         self.close()
 
     def close(self) -> None:
-        self.metastore.close()
         self.warehouse.close()
 
     @property
-    def
-
+    def uri(self):
+        from datachain.lib.listing import listing_uri_from_name
+
+        return listing_uri_from_name(self.dataset.name)
 
     @property
     def dataset_rows(self):
-        return self.warehouse.dataset_rows(
-
-    def fetch(self, start_prefix="", method: str = "default") -> None:
-        sync(get_loop(), self._fetch, start_prefix, method)
-
-    async def _fetch(self, start_prefix: str, method: str) -> None:
-        with self.clone() as fetch_listing:
-            if start_prefix:
-                start_prefix = start_prefix.rstrip("/")
-            try:
-                async for entries in fetch_listing.client.scandir(
-                    start_prefix, method=method
-                ):
-                    fetch_listing.insert_entries(entries)
-                    if len(entries) > 1:
-                        fetch_listing.metastore.update_last_inserted_at()
-            finally:
-                fetch_listing.insert_entries_done()
-
-    def insert_entry(self, entry: File) -> None:
-        self.insert_entries([entry])
-
-    def insert_entries(self, entries: Iterable[File]) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(entries),
+        return self.warehouse.dataset_rows(
+            self.dataset, self.dataset.latest_version, object_name=self.object_name
         )
 
-    def insert_entries_done(self) -> None:
-        self.warehouse.insert_rows_done(self.dataset_rows.get_table())
-
     def expand_path(self, path, use_glob=True) -> list[Node]:
         if use_glob and glob.has_magic(path):
             return self.warehouse.expand_path(self.dataset_rows, path)
@@ -200,25 +168,31 @@
         conds = []
         if names:
             for name in names:
-                conds.append(
+                conds.append(
+                    pathfunc.name(Column(dr.col_name("path"))).op("GLOB")(name)
+                )
         if inames:
             for iname in inames:
                 conds.append(
-                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(
+                    func.lower(pathfunc.name(Column(dr.col_name("path")))).op("GLOB")(
+                        iname.lower()
+                    )
                 )
         if paths:
             for path in paths:
-                conds.append(Column("path").op("GLOB")(path))
+                conds.append(Column(dr.col_name("path")).op("GLOB")(path))
         if ipaths:
             for ipath in ipaths:
-                conds.append(
+                conds.append(
+                    func.lower(Column(dr.col_name("path"))).op("GLOB")(ipath.lower())
+                )
 
         if size is not None:
             size_limit = suffix_to_number(size)
             if size_limit >= 0:
-                conds.append(Column("size") >= size_limit)
+                conds.append(Column(dr.col_name("size")) >= size_limit)
             else:
-                conds.append(Column("size") <= -size_limit)
+                conds.append(Column(dr.col_name("size")) <= -size_limit)
 
         return self.warehouse.find(
             dr,
datachain/query/dataset.py
CHANGED

@@ -10,6 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
 from functools import wraps
+from secrets import token_hex
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -173,10 +174,10 @@ class QueryStep(StartingStep):
             return sqlalchemy.select(*columns)
 
         dataset = self.catalog.get_dataset(self.dataset_name)
-
+        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
 
         return step_result(
-            q,
+            q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )
 
 
@@ -720,10 +721,17 @@ class SQLMutate(SQLClause):
 
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
+        to_mutate = {c.name for c in self.args}
+
+        prefix = f"mutate{token_hex(8)}_"
+        cols = [
+            c.label(prefix + c.name) if c.name in to_mutate else c
+            for c in original_subquery.c
+        ]
         # this is needed for new column to be used in clauses
         # like ORDER BY, otherwise new column is not recognized
         subquery = (
-            sqlalchemy.select(*original_subquery.c, *self.args)
+            sqlalchemy.select(*cols, *self.args)
             .select_from(original_subquery)
             .subquery()
        )
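
The SQLMutate change above lets a mutation reuse an existing column name: every original column that is being mutated is relabeled with a random mutate<hex>_ prefix, so the new expression can take the name without a duplicate-column conflict. A self-contained SQLAlchemy sketch of the same trick (the table t and the doubled "size" value are made up for illustration):

from secrets import token_hex

import sqlalchemy

metadata = sqlalchemy.MetaData()
t = sqlalchemy.Table("t", metadata, sqlalchemy.Column("size", sqlalchemy.Integer))

original_subquery = sqlalchemy.select(t).subquery()
# the mutation reuses the existing name "size"
args = [(original_subquery.c.size * 2).label("size")]

to_mutate = {c.name for c in args}
prefix = f"mutate{token_hex(8)}_"  # unique prefix hides the old column
cols = [
    c.label(prefix + c.name) if c.name in to_mutate else c
    for c in original_subquery.c
]
subquery = (
    sqlalchemy.select(*cols, *args).select_from(original_subquery).subquery()
)
print(sqlalchemy.select(subquery.c.size))  # "size" now refers to the mutated value
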
datachain/remote/studio.py
CHANGED

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from collections.abc import Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
@@ -10,8 +11,10 @@ from typing import (
     TypeVar,
 )
 
+from datachain.config import Config
 from datachain.dataset import DatasetStats
-from datachain.
+from datachain.error import DataChainError
+from datachain.utils import STUDIO_URL, retry_with_backoff
 
 T = TypeVar("T")
 LsData = Optional[list[dict[str, Any]]]
@@ -54,14 +57,54 @@ class Response(Generic[T]):
 
 
 class StudioClient:
-    def __init__(
-        self, url: str, username: str, token: str, timeout: float = 3600.0
-    ) -> None:
+    def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
         self._check_dependencies()
-        self.url = url.rstrip("/")
-        self.username = username
-        self.token = token
         self.timeout = timeout
+        self._config = None
+        self._team = team
+
+    @property
+    def token(self) -> str:
+        token = os.environ.get("DVC_STUDIO_TOKEN") or self.config.get("token")
+
+        if not token:
+            raise DataChainError(
+                "Studio token is not set. Use `datachain studio login` "
+                "or environment variable `DVC_STUDIO_TOKEN` to set it."
+            )
+
+        return token
+
+    @property
+    def url(self) -> str:
+        return (
+            os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
+        ) + "/api"
+
+    @property
+    def config(self) -> dict:
+        if self._config is None:
+            self._config = Config().read().get("studio", {})
+        return self._config  # type: ignore [return-value]
+
+    @property
+    def team(self) -> str:
+        if self._team is None:
+            self._team = self._get_team()
+        return self._team
+
+    def _get_team(self) -> str:
+        team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")
+
+        if not team:
+            raise DataChainError(
+                "Studio team is not set. "
+                "Use `datachain studio team <team_name>` "
+                "or environment variable `DVC_STUDIO_TEAM` to set it."
+                "You can also set it in the config file as team under studio."
+            )
+
+        return team
 
     def _check_dependencies(self) -> None:
         try:
@@ -80,7 +123,7 @@ class StudioClient:
 
         response = requests.post(
             f"{self.url}/{route}",
-            json={**data, "team_name": self.
+            json={**data, "team_name": self.team},
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
@@ -108,7 +151,7 @@ class StudioClient:
 
         response = requests.post(
             f"{self.url}/{route}",
-            json={**data, "team_name": self.
+            json={**data, "team_name": self.team},
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
@@ -174,6 +217,9 @@ class StudioClient:
             response = self._send_request_msgpack("ls", {"source": path})
             yield path, response
 
+    def ls_datasets(self) -> Response[LsData]:
+        return self._send_request("datachain/ls-datasets", {})
+
     def dataset_info(self, name: str) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])
@@ -182,7 +228,7 @@ class StudioClient:
 
             return dataset_info
 
-        response = self._send_request("dataset-info", {"dataset_name": name})
+        response = self._send_request("datachain/dataset-info", {"dataset_name": name})
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response
@@ -192,13 +238,14 @@ class StudioClient:
     ) -> Response[DatasetRowsData]:
         req_data = {"dataset_name": name, "dataset_version": version}
         return self._send_request_msgpack(
-            "dataset-rows",
+            "datachain/dataset-rows",
             {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
         )
 
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
-            "dataset-stats",
+            "datachain/dataset-stats",
+            {"dataset_name": name, "dataset_version": version},
         )
         if response.ok:
             response.data = DatasetStats(**response.data)
@@ -208,12 +255,14 @@ class StudioClient:
         self, name: str, version: int
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
-            "dataset-export",
+            "datachain/dataset-export",
+            {"dataset_name": name, "dataset_version": version},
        )
 
     def dataset_export_status(
         self, name: str, version: int
    ) -> Response[DatasetExportStatus]:
        return self._send_request(
-            "dataset-export-status",
+            "datachain/dataset-export-status",
+            {"dataset_name": name, "dataset_version": version},
        )
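
With the rewrite above, StudioClient no longer takes url, username, and token in its constructor; each setting is resolved lazily with the precedence environment variable, then the "studio" section of the config, then a default (URL only), raising DataChainError when the token or team cannot be resolved. A small sketch of that resolution order, with hypothetical config contents:

import os

STUDIO_URL = "https://studio.dvc.ai"  # default defined in datachain.utils

def resolve(env_var: str, config: dict, key: str, default: str = "") -> str:
    # precedence: environment variable, then config file, then default
    return os.environ.get(env_var) or config.get(key) or default

studio_conf = {"team": "my-team"}  # hypothetical Config().read()["studio"]
url = resolve("DVC_STUDIO_URL", studio_conf, "url", STUDIO_URL) + "/api"
team = resolve("DVC_STUDIO_TEAM", studio_conf, "team")  # empty would raise in the client
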
datachain/studio.py
ADDED

@@ -0,0 +1,129 @@
+import os
+from typing import TYPE_CHECKING
+
+from datachain.catalog.catalog import raise_remote_error
+from datachain.config import Config, ConfigLevel
+from datachain.error import DataChainError
+from datachain.remote.studio import StudioClient
+from datachain.utils import STUDIO_URL
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+POST_LOGIN_MESSAGE = (
+    "Once you've logged in, return here "
+    "and you'll be ready to start using DataChain with Studio."
+)
+
+
+def process_studio_cli_args(args: "Namespace"):
+    if args.cmd == "login":
+        return login(args)
+    if args.cmd == "logout":
+        return logout()
+    if args.cmd == "token":
+        return token()
+    if args.cmd == "datasets":
+        return list_datasets(args)
+    if args.cmd == "team":
+        return set_team(args)
+    raise DataChainError(f"Unknown command '{args.cmd}'.")
+
+
+def set_team(args: "Namespace"):
+    level = ConfigLevel.GLOBAL if args.__dict__.get("global") else ConfigLevel.LOCAL
+    config = Config(level)
+    with config.edit() as conf:
+        studio_conf = conf.get("studio", {})
+        studio_conf["team"] = args.team_name
+        conf["studio"] = studio_conf
+
+    print(f"Set default team to '{args.team_name}' in {config.config_file()}")
+
+
+def login(args: "Namespace"):
+    from dvc_studio_client.auth import StudioAuthError, get_access_token
+
+    config = Config().read().get("studio", {})
+    name = args.name
+    hostname = (
+        args.hostname
+        or os.environ.get("DVC_STUDIO_URL")
+        or config.get("url")
+        or STUDIO_URL
+    )
+    scopes = args.scopes
+
+    if config.get("url", hostname) == hostname and "token" in config:
+        raise DataChainError(
+            "Token already exists. "
+            "To login with a different token, "
+            "logout using `datachain studio logout`."
+        )
+
+    open_browser = not args.no_open
+    try:
+        _, access_token = get_access_token(
+            token_name=name,
+            hostname=hostname,
+            scopes=scopes,
+            open_browser=open_browser,
+            client_name="DataChain",
+            post_login_message=POST_LOGIN_MESSAGE,
+        )
+    except StudioAuthError as exc:
+        raise DataChainError(f"Failed to authenticate with Studio: {exc}") from exc
+
+    config_path = save_config(hostname, access_token)
+    print(f"Authentication complete. Saved token to {config_path}.")
+    return 0
+
+
+def logout():
+    with Config(ConfigLevel.GLOBAL).edit() as conf:
+        token = conf.get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain studio login'."
+            )
+
+        del conf["studio"]["token"]
+
+    print("Logged out from Studio. (you can log back in with 'datachain studio login')")
+
+
+def token():
+    config = Config().read().get("studio", {})
+    token = config.get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    print(token)
+
+
+def list_datasets(args: "Namespace"):
+    client = StudioClient(team=args.team)
+    response = client.ls_datasets()
+    if not response.ok:
+        raise_remote_error(response.message)
+    if not response.data:
+        print("No datasets found.")
+        return
+    for d in response.data:
+        name = d.get("name")
+        for v in d.get("versions", []):
+            version = v.get("version")
+            print(f"{name} (v{version})")
+
+
+def save_config(hostname, token):
+    config = Config(ConfigLevel.GLOBAL)
+    with config.edit() as conf:
+        studio_conf = conf.get("studio", {})
+        studio_conf["url"] = hostname
+        studio_conf["token"] = token
+        conf["studio"] = studio_conf
+
+    return config.config_file()
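
A hypothetical way to drive the new module programmatically; in the release, datachain/cli.py builds this Namespace from the "datachain studio ..." argparse subcommands, so the attribute names below mirror set_team():

from argparse import Namespace

from datachain.studio import process_studio_cli_args

# equivalent of `datachain studio team my-team`; "global" must be passed via a
# dict because it is a Python keyword. Running this writes to the local config.
args = Namespace(cmd="team", team_name="my-team", **{"global": False})
process_studio_cli_args(args)
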
datachain/utils.py
CHANGED

@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
 import cloudpickle
+import platformdirs
 from dateutil import tz
 from dateutil.parser import isoparse
 from pydantic import BaseModel
@@ -25,6 +26,13 @@ if TYPE_CHECKING:
 NUL = b"\0"
 TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
 
+APPNAME = "datachain"
+APPAUTHOR = "iterative"
+ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
+ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
+STUDIO_URL = "https://studio.dvc.ai"
+
+
 T = TypeVar("T", bound="DataChainDir")
 
 
@@ -33,6 +41,7 @@ class DataChainDir:
     CACHE = "cache"
     TMP = "tmp"
     DB = "db"
+    CONFIG = "config"
     ENV_VAR = "DATACHAIN_DIR"
     ENV_VAR_DATACHAIN_ROOT = "DATACHAIN_ROOT_DIR"
 
@@ -42,6 +51,7 @@ class DataChainDir:
         cache: Optional[str] = None,
         tmp: Optional[str] = None,
         db: Optional[str] = None,
+        config: Optional[str] = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -51,12 +61,24 @@ class DataChainDir:
             osp.abspath(tmp) if tmp is not None else osp.join(self.root, self.TMP)
         )
         self.db = osp.abspath(db) if db is not None else osp.join(self.root, self.DB)
+        self.config = (
+            osp.abspath(config)
+            if config is not None
+            else osp.join(self.root, self.CONFIG)
+        )
+        self.config = (
+            osp.abspath(config)
+            if config is not None
+            else osp.join(self.root, self.CONFIG)
+        )
 
     def init(self):
        os.makedirs(self.root, exist_ok=True)
        os.makedirs(self.cache, exist_ok=True)
        os.makedirs(self.tmp, exist_ok=True)
        os.makedirs(osp.split(self.db)[0], exist_ok=True)
+        os.makedirs(osp.split(self.config)[0], exist_ok=True)
+        os.makedirs(osp.split(self.config)[0], exist_ok=True)
 
     @classmethod
     def default_root(cls) -> str:
@@ -82,6 +104,18 @@ class DataChainDir:
         return instance
 
 
+def system_config_dir():
+    return os.getenv(ENV_DATACHAIN_SYSTEM_CONFIG_DIR) or platformdirs.site_config_dir(
+        APPNAME, APPAUTHOR
+    )
+
+
+def global_config_dir():
+    return os.getenv(ENV_DATACHAIN_GLOBAL_CONFIG_DIR) or platformdirs.user_config_dir(
+        APPNAME, APPAUTHOR
+    )
+
+
 def human_time_to_int(time: str) -> Optional[int]:
     if not time:
         return None
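
The new config-dir helpers above defer to platformdirs unless overridden by the DATACHAIN_*_CONFIG_DIR environment variables. For example (exact paths vary by OS):

import platformdirs

# what global_config_dir() falls back to when DATACHAIN_GLOBAL_CONFIG_DIR is
# unset; on Linux this is typically ~/.config/datachain
print(platformdirs.user_config_dir("datachain", "iterative"))
print(platformdirs.site_config_dir("datachain", "iterative"))  # system-wide level
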
@@ -421,3 +455,27 @@ def env2bool(var, undefined=False):
     if var is None:
         return undefined
     return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
+
+
+def nested_dict_path_set(
+    data: dict[str, Any], path: Sequence[str], value: Any
+) -> dict[str, Any]:
+    """Sets a value inside a nested dict based on the list of dict keys as a path,
+    and will create sub-dicts as needed to set the value."""
+    sub_data = data
+    for element in path[:-1]:
+        if element not in sub_data:
+            sub_data[element] = {}
+        sub_data = sub_data[element]
+    sub_data[path[len(path) - 1]] = value
+    return data
+
+
+def row_to_nested_dict(
+    headers: Iterable[Sequence[str]], row: Iterable[Any]
+) -> dict[str, Any]:
+    """Converts a row to a nested dict based on the provided headers."""
+    result: dict[str, Any] = {}
+    for h, v in zip(headers, row):
+        nested_dict_path_set(result, h, v)
+    return result
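
Expected behavior of the two new helpers above, per their docstrings (the sample values are illustrative):

from datachain.utils import nested_dict_path_set, row_to_nested_dict

d = nested_dict_path_set({}, ["studio", "team"], "my-team")
assert d == {"studio": {"team": "my-team"}}

# each header is the key path for the corresponding row value
headers = [("file", "path"), ("file", "size"), ("name",)]
row = ("images/cat.jpg", 2048, "cat.jpg")
assert row_to_nested_dict(headers, row) == {
    "file": {"path": "images/cat.jpg", "size": 2048},
    "name": "cat.jpg",
}
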
{datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.1
+Version: 0.6.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -19,7 +19,7 @@ License-File: LICENSE
 Requires-Dist: pyyaml
 Requires-Dist: tomlkit
 Requires-Dist: tqdm
-Requires-Dist: numpy
+Requires-Dist: numpy <3,>=1
 Requires-Dist: pandas >=2.0.0
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
@@ -38,15 +38,16 @@ Requires-Dist: orjson >=3.10.5
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
-Requires-Dist: Pillow <
+Requires-Dist: Pillow <12,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface-hub
 Requires-Dist: iterative-telemetry >=0.0.9
-Requires-Dist:
+Requires-Dist: platformdirs
+Requires-Dist: dvc-studio-client <1,>=0.21
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.12.
+Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -63,7 +64,7 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
 Requires-Dist: numpy <2,>=1 ; extra == 'examples'
 Requires-Dist: defusedxml ; extra == 'examples'
 Requires-Dist: accelerate ; extra == 'examples'
-Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
+Requires-Dist: unstructured[embed-huggingface,pdf] <0.16.0 ; extra == 'examples'
 Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
 Requires-Dist: onnx ==1.16.1 ; extra == 'examples'