datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +2 -0
- datachain/catalog/catalog.py +62 -228
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +12 -4
- datachain/data_storage/warehouse.py +101 -129
- datachain/lib/convert/sql_to_python.py +8 -12
- datachain/lib/dc.py +275 -80
- datachain/lib/func/__init__.py +32 -0
- datachain/lib/func/aggregate.py +353 -0
- datachain/lib/func/func.py +152 -0
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +17 -8
- datachain/lib/udf.py +3 -3
- datachain/lib/utils.py +5 -0
- datachain/listing.py +22 -48
- datachain/query/__init__.py +1 -2
- datachain/query/batch.py +0 -1
- datachain/query/dataset.py +33 -46
- datachain/query/schema.py +1 -61
- datachain/query/session.py +33 -25
- datachain/remote/studio.py +63 -14
- datachain/sql/functions/__init__.py +1 -1
- datachain/sql/functions/aggregate.py +47 -0
- datachain/sql/functions/array.py +0 -8
- datachain/sql/sqlite/base.py +20 -2
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/remote/studio.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import os
|
|
3
4
|
from collections.abc import Iterable, Iterator
|
|
4
5
|
from datetime import datetime, timedelta, timezone
|
|
5
6
|
from struct import unpack
|
|
@@ -10,8 +11,10 @@ from typing import (
|
|
|
10
11
|
TypeVar,
|
|
11
12
|
)
|
|
12
13
|
|
|
14
|
+
from datachain.config import Config
|
|
13
15
|
from datachain.dataset import DatasetStats
|
|
14
|
-
from datachain.
|
|
16
|
+
from datachain.error import DataChainError
|
|
17
|
+
from datachain.utils import STUDIO_URL, retry_with_backoff
|
|
15
18
|
|
|
16
19
|
T = TypeVar("T")
|
|
17
20
|
LsData = Optional[list[dict[str, Any]]]
|
|
@@ -54,14 +57,54 @@ class Response(Generic[T]):
|
|
|
54
57
|
|
|
55
58
|
|
|
56
59
|
class StudioClient:
|
|
57
|
-
def __init__(
|
|
58
|
-
self, url: str, username: str, token: str, timeout: float = 3600.0
|
|
59
|
-
) -> None:
|
|
60
|
+
def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
|
|
60
61
|
self._check_dependencies()
|
|
61
|
-
self.url = url.rstrip("/")
|
|
62
|
-
self.username = username
|
|
63
|
-
self.token = token
|
|
64
62
|
self.timeout = timeout
|
|
63
|
+
self._config = None
|
|
64
|
+
self._team = team
|
|
65
|
+
|
|
66
|
+
@property
def token(self) -> str:
    """Return the Studio access token.

    The `DVC_STUDIO_TOKEN` environment variable is consulted first; the
    `token` entry of the studio config section is the fallback. Empty
    values are treated as unset.

    Raises:
        DataChainError: if neither source yields a non-empty token.
    """
    from_env = os.environ.get("DVC_STUDIO_TOKEN")
    if from_env:
        return from_env

    from_config = self.config.get("token")
    if from_config:
        return from_config

    raise DataChainError(
        "Studio token is not set. Use `datachain studio login` "
        "or environment variable `DVC_STUDIO_TOKEN` to set it."
    )
|
|
77
|
+
|
|
78
|
+
@property
def url(self) -> str:
    """Return the base URL of the Studio HTTP API.

    The host is taken from the `DVC_STUDIO_URL` environment variable, then
    from the `url` entry of the studio config section, then from the
    `STUDIO_URL` default. Trailing slashes on the host are dropped before
    `/api` is appended, so a value such as `https://host/` does not
    produce `https://host//api`.
    """
    base = os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
    return base.rstrip("/") + "/api"
|
|
83
|
+
|
|
84
|
+
@property
def config(self) -> dict:
    """Return the `studio` section of the configuration.

    The section is read through `Config().read()` on first access and kept
    on the instance afterwards; a missing section yields an empty dict.
    """
    cached = self._config
    if cached is not None:
        return cached  # type: ignore [return-value]

    self._config = Config().read().get("studio", {})
    return self._config  # type: ignore [return-value]
|
|
89
|
+
|
|
90
|
+
@property
def team(self) -> str:
    """Return the Studio team name used for requests.

    A team passed to the constructor is used as is; otherwise it is
    resolved once through `_get_team()` and remembered on the instance.
    """
    if self._team is not None:
        return self._team

    resolved = self._get_team()
    self._team = resolved
    return self._team
|
|
95
|
+
|
|
96
|
+
def _get_team(self) -> str:
|
|
97
|
+
team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")
|
|
98
|
+
|
|
99
|
+
if not team:
|
|
100
|
+
raise DataChainError(
|
|
101
|
+
"Studio team is not set. "
|
|
102
|
+
"Use `datachain studio team <team_name>` "
|
|
103
|
+
"or environment variable `DVC_STUDIO_TEAM` to set it."
|
|
104
|
+
"You can also set it in the config file as team under studio."
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
return team
|
|
65
108
|
|
|
66
109
|
def _check_dependencies(self) -> None:
|
|
67
110
|
try:
|
|
@@ -80,7 +123,7 @@ class StudioClient:
|
|
|
80
123
|
|
|
81
124
|
response = requests.post(
|
|
82
125
|
f"{self.url}/{route}",
|
|
83
|
-
json={**data, "team_name": self.
|
|
126
|
+
json={**data, "team_name": self.team},
|
|
84
127
|
headers={
|
|
85
128
|
"Content-Type": "application/json",
|
|
86
129
|
"Authorization": f"token {self.token}",
|
|
@@ -108,7 +151,7 @@ class StudioClient:
|
|
|
108
151
|
|
|
109
152
|
response = requests.post(
|
|
110
153
|
f"{self.url}/{route}",
|
|
111
|
-
json={**data, "team_name": self.
|
|
154
|
+
json={**data, "team_name": self.team},
|
|
112
155
|
headers={
|
|
113
156
|
"Content-Type": "application/json",
|
|
114
157
|
"Authorization": f"token {self.token}",
|
|
@@ -174,6 +217,9 @@ class StudioClient:
|
|
|
174
217
|
response = self._send_request_msgpack("ls", {"source": path})
|
|
175
218
|
yield path, response
|
|
176
219
|
|
|
220
|
+
def ls_datasets(self) -> Response[LsData]:
    """Request the dataset listing from the `datachain/ls-datasets` route.

    No request parameters are sent besides what `_send_request` adds itself.
    """
    route = "datachain/ls-datasets"
    payload: dict = {}
    return self._send_request(route, payload)
|
|
222
|
+
|
|
177
223
|
def dataset_info(self, name: str) -> Response[DatasetInfoData]:
|
|
178
224
|
def _parse_dataset_info(dataset_info):
|
|
179
225
|
_parse_dates(dataset_info, ["created_at", "finished_at"])
|
|
@@ -182,7 +228,7 @@ class StudioClient:
|
|
|
182
228
|
|
|
183
229
|
return dataset_info
|
|
184
230
|
|
|
185
|
-
response = self._send_request("dataset-info", {"dataset_name": name})
|
|
231
|
+
response = self._send_request("datachain/dataset-info", {"dataset_name": name})
|
|
186
232
|
if response.ok:
|
|
187
233
|
response.data = _parse_dataset_info(response.data)
|
|
188
234
|
return response
|
|
@@ -192,13 +238,14 @@ class StudioClient:
|
|
|
192
238
|
) -> Response[DatasetRowsData]:
|
|
193
239
|
req_data = {"dataset_name": name, "dataset_version": version}
|
|
194
240
|
return self._send_request_msgpack(
|
|
195
|
-
"dataset-rows",
|
|
241
|
+
"datachain/dataset-rows",
|
|
196
242
|
{**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
|
|
197
243
|
)
|
|
198
244
|
|
|
199
245
|
def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
|
|
200
246
|
response = self._send_request(
|
|
201
|
-
"dataset-stats",
|
|
247
|
+
"datachain/dataset-stats",
|
|
248
|
+
{"dataset_name": name, "dataset_version": version},
|
|
202
249
|
)
|
|
203
250
|
if response.ok:
|
|
204
251
|
response.data = DatasetStats(**response.data)
|
|
@@ -208,12 +255,14 @@ class StudioClient:
|
|
|
208
255
|
self, name: str, version: int
|
|
209
256
|
) -> Response[DatasetExportSignedUrls]:
|
|
210
257
|
return self._send_request(
|
|
211
|
-
"dataset-export",
|
|
258
|
+
"datachain/dataset-export",
|
|
259
|
+
{"dataset_name": name, "dataset_version": version},
|
|
212
260
|
)
|
|
213
261
|
|
|
214
262
|
def dataset_export_status(
|
|
215
263
|
self, name: str, version: int
|
|
216
264
|
) -> Response[DatasetExportStatus]:
|
|
217
265
|
return self._send_request(
|
|
218
|
-
"dataset-export-status",
|
|
266
|
+
"datachain/dataset-export-status",
|
|
267
|
+
{"dataset_name": name, "dataset_version": version},
|
|
219
268
|
)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from sqlalchemy.sql.functions import GenericFunction, ReturnTypeFromArgs
|
|
2
|
+
|
|
3
|
+
from datachain.sql.types import Float, String
|
|
4
|
+
from datachain.sql.utils import compiler_not_implemented
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class avg(GenericFunction):  # noqa: N801
    """
    SQL aggregate that returns the average of a column.

    The result type is fixed to `Float`; the function is declared under the
    name `avg` in the `array` function package.
    """

    name = "avg"
    package = "array"
    type = Float()
    inherit_cache = True
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class group_concat(GenericFunction):  # noqa: N801
    """
    SQL aggregate that returns the concatenated string of a column.

    The result type is fixed to `String`; the function is declared under the
    name `group_concat` in the `array` function package.
    """

    name = "group_concat"
    package = "array"
    type = String()
    inherit_cache = True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class any_value(ReturnTypeFromArgs):  # noqa: N801
    """
    SQL aggregate that returns the first value of a column.

    Being a `ReturnTypeFromArgs` function, its SQL type is taken from the
    argument it is applied to.
    """

    inherit_cache = True
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class collect(ReturnTypeFromArgs):  # noqa: N801
    """
    SQL aggregate that returns an array of the values of a column.

    Declared as a `ReturnTypeFromArgs` function, so its SQL type is derived
    from the argument it is applied to.
    """

    inherit_cache = True
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Register every aggregate declared in this module with
# `compiler_not_implemented`, so that all four are treated alike.
# NOTE(review): presumably this installs a default compiler that fails for
# dialects without their own implementation - confirm in datachain.sql.utils.
compiler_not_implemented(avg)
compiler_not_implemented(group_concat)
compiler_not_implemented(any_value)
compiler_not_implemented(collect)
|
datachain/sql/functions/array.py
CHANGED
|
@@ -44,15 +44,7 @@ class sip_hash_64(GenericFunction): # noqa: N801
|
|
|
44
44
|
inherit_cache = True
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
class avg(GenericFunction): # noqa: N801
|
|
48
|
-
type = Float()
|
|
49
|
-
package = "array"
|
|
50
|
-
name = "avg"
|
|
51
|
-
inherit_cache = True
|
|
52
|
-
|
|
53
|
-
|
|
54
47
|
compiler_not_implemented(cosine_distance)
|
|
55
48
|
compiler_not_implemented(euclidean_distance)
|
|
56
49
|
compiler_not_implemented(length)
|
|
57
50
|
compiler_not_implemented(sip_hash_64)
|
|
58
|
-
compiler_not_implemented(avg)
|
datachain/sql/sqlite/base.py
CHANGED
|
@@ -14,7 +14,7 @@ from sqlalchemy.sql.elements import literal
|
|
|
14
14
|
from sqlalchemy.sql.expression import case
|
|
15
15
|
from sqlalchemy.sql.functions import func
|
|
16
16
|
|
|
17
|
-
from datachain.sql.functions import array, conditional, random, string
|
|
17
|
+
from datachain.sql.functions import aggregate, array, conditional, random, string
|
|
18
18
|
from datachain.sql.functions import path as sql_path
|
|
19
19
|
from datachain.sql.selectable import Values, base_values_compiler
|
|
20
20
|
from datachain.sql.sqlite.types import (
|
|
@@ -84,7 +84,10 @@ def setup():
|
|
|
84
84
|
compiles(conditional.least, "sqlite")(compile_least)
|
|
85
85
|
compiles(Values, "sqlite")(compile_values)
|
|
86
86
|
compiles(random.rand, "sqlite")(compile_rand)
|
|
87
|
-
compiles(array.avg, "sqlite")(compile_avg)
|
|
87
|
+
compiles(aggregate.avg, "sqlite")(compile_avg)
|
|
88
|
+
compiles(aggregate.group_concat, "sqlite")(compile_group_concat)
|
|
89
|
+
compiles(aggregate.any_value, "sqlite")(compile_any_value)
|
|
90
|
+
compiles(aggregate.collect, "sqlite")(compile_collect)
|
|
88
91
|
|
|
89
92
|
if load_usearch_extension(sqlite3.connect(":memory:")):
|
|
90
93
|
compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
|
|
@@ -400,6 +403,21 @@ def compile_avg(element, compiler, **kwargs):
|
|
|
400
403
|
return compiler.process(func.avg(*element.clauses.clauses), **kwargs)
|
|
401
404
|
|
|
402
405
|
|
|
406
|
+
def compile_group_concat(element, compiler, **kwargs):
    """Compile `group_concat` for SQLite via `func.aggregate_strings`."""
    arguments = element.clauses.clauses
    sqlite_expr = func.aggregate_strings(*arguments)
    return compiler.process(sqlite_expr, **kwargs)
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def compile_any_value(element, compiler, **kwargs):
    """Compile `any_value` for SQLite as the bare argument expression."""
    # SQLite documents that a bare column in an aggregate query yields a
    # value from an arbitrary row of the group, so no wrapping function is
    # emitted, see
    # https://www.sqlite.org/lang_select.html#bare_columns_in_an_aggregate_query
    arguments = element.clauses.clauses
    return compiler.process(*arguments, **kwargs)
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def compile_collect(element, compiler, **kwargs):
    """Compile `collect` for SQLite via `func.json_group_array`."""
    arguments = element.clauses.clauses
    sqlite_expr = func.json_group_array(*arguments)
    return compiler.process(sqlite_expr, **kwargs)
|
|
419
|
+
|
|
420
|
+
|
|
403
421
|
def load_usearch_extension(conn) -> bool:
|
|
404
422
|
try:
|
|
405
423
|
# usearch is part of the vector optional dependencies
|
datachain/studio.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from datachain.catalog.catalog import raise_remote_error
|
|
5
|
+
from datachain.config import Config, ConfigLevel
|
|
6
|
+
from datachain.error import DataChainError
|
|
7
|
+
from datachain.remote.studio import StudioClient
|
|
8
|
+
from datachain.utils import STUDIO_URL
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from argparse import Namespace
|
|
12
|
+
|
|
13
|
+
POST_LOGIN_MESSAGE = (
|
|
14
|
+
"Once you've logged in, return here "
|
|
15
|
+
"and you'll be ready to start using DataChain with Studio."
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def process_studio_cli_args(args: "Namespace"):
    """Dispatch a `studio` sub-command to its handler.

    `args.cmd` selects the handler; the handler's return value is passed
    through unchanged.

    Raises:
        DataChainError: if `args.cmd` names no known sub-command.
    """
    # Checked in order with `==`, one entry per supported sub-command.
    handlers = (
        ("login", lambda: login(args)),
        ("logout", lambda: logout()),
        ("token", lambda: token()),
        ("datasets", lambda: list_datasets(args)),
        ("team", lambda: set_team(args)),
    )
    for command, run in handlers:
        if args.cmd == command:
            return run()

    raise DataChainError(f"Unknown command '{args.cmd}'.")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def set_team(args: "Namespace"):
    """Store `args.team_name` as the default Studio team in the config.

    The global config level is edited when the namespace carries a truthy
    `global` attribute, the local level otherwise.
    """
    # `global` is a Python keyword, so it cannot be read as `args.global`.
    if vars(args).get("global"):
        level = ConfigLevel.GLOBAL
    else:
        level = ConfigLevel.LOCAL

    config = Config(level)
    with config.edit() as conf:
        section = conf.get("studio", {})
        section["team"] = args.team_name
        conf["studio"] = section

    print(f"Set default team to '{args.team_name}' in {config.config_file()}")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def login(args: "Namespace"):
    """Authenticate against Studio and store the obtained access token.

    The Studio host comes from `args.hostname`, then the `DVC_STUDIO_URL`
    environment variable, then the `url` entry of the studio config
    section, then `STUDIO_URL`. The token returned by `get_access_token`
    is saved through `save_config`.

    Returns:
        0 once the token has been saved.

    Raises:
        DataChainError: if a token is already stored for that host, or if
            `get_access_token` fails with `StudioAuthError`.
    """
    from dvc_studio_client.auth import StudioAuthError, get_access_token

    studio_conf = Config().read().get("studio", {})
    token_name = args.name
    hostname = args.hostname
    if not hostname:
        hostname = os.environ.get("DVC_STUDIO_URL")
    if not hostname:
        hostname = studio_conf.get("url")
    if not hostname:
        hostname = STUDIO_URL
    requested_scopes = args.scopes

    # A config without a `url` entry is treated as belonging to this host.
    same_host = studio_conf.get("url", hostname) == hostname
    if same_host and "token" in studio_conf:
        raise DataChainError(
            "Token already exists. "
            "To login with a different token, "
            "logout using `datachain studio logout`."
        )

    launch_browser = not args.no_open
    try:
        _, access_token = get_access_token(
            token_name=token_name,
            hostname=hostname,
            scopes=requested_scopes,
            open_browser=launch_browser,
            client_name="DataChain",
            post_login_message=POST_LOGIN_MESSAGE,
        )
    except StudioAuthError as exc:
        raise DataChainError(f"Failed to authenticate with Studio: {exc}") from exc

    config_path = save_config(hostname, access_token)
    print(f"Authentication complete. Saved token to {config_path}.")
    return 0
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def logout():
    """Remove the stored Studio token from the global config.

    Raises:
        DataChainError: if the global config holds no Studio token.
    """
    global_config = Config(ConfigLevel.GLOBAL)
    with global_config.edit() as conf:
        stored_token = conf.get("studio", {}).get("token")
        if stored_token:
            del conf["studio"]["token"]
        else:
            raise DataChainError(
                "Not logged in to Studio. Log in with 'datachain studio login'."
            )

    print("Logged out from Studio. (you can log back in with 'datachain studio login')")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def token():
    """Print the Studio token stored in the config.

    Only the `token` entry of the studio config section is consulted.

    Raises:
        DataChainError: if no token is stored.
    """
    studio_conf = Config().read().get("studio", {})
    stored_token = studio_conf.get("token")
    if stored_token:
        print(stored_token)
        return

    raise DataChainError(
        "Not logged in to Studio. Log in with 'datachain studio login'."
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def list_datasets(args: "Namespace"):
    """Print every dataset version Studio reports for the team.

    One line is printed per version, in the form `<name> (v<version>)`. A
    failed response is handed to `raise_remote_error`; an empty listing
    prints a short notice instead.
    """
    response = StudioClient(team=args.team).ls_datasets()
    if not response.ok:
        raise_remote_error(response.message)

    if not response.data:
        print("No datasets found.")
        return

    for dataset in response.data:
        name = dataset.get("name")
        for entry in dataset.get("versions", []):
            version = entry.get("version")
            print(f"{name} (v{version})")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def save_config(hostname, token):
    """Write the Studio URL and token to the global config.

    Other entries of the `studio` section are kept.

    Returns:
        Whatever `config_file()` reports for the global config.
    """
    global_config = Config(ConfigLevel.GLOBAL)
    with global_config.edit() as conf:
        section = conf.get("studio", {})
        section["url"] = hostname
        section["token"] = token
        conf["studio"] = section

    return global_config.config_file()
|
datachain/utils.py
CHANGED
|
@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
|
|
15
15
|
from uuid import UUID
|
|
16
16
|
|
|
17
17
|
import cloudpickle
|
|
18
|
+
import platformdirs
|
|
18
19
|
from dateutil import tz
|
|
19
20
|
from dateutil.parser import isoparse
|
|
20
21
|
from pydantic import BaseModel
|
|
@@ -25,6 +26,13 @@ if TYPE_CHECKING:
|
|
|
25
26
|
NUL = b"\0"
|
|
26
27
|
TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
|
|
27
28
|
|
|
29
|
+
APPNAME = "datachain"
|
|
30
|
+
APPAUTHOR = "iterative"
|
|
31
|
+
ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
|
|
32
|
+
ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
|
|
33
|
+
STUDIO_URL = "https://studio.dvc.ai"
|
|
34
|
+
|
|
35
|
+
|
|
28
36
|
T = TypeVar("T", bound="DataChainDir")
|
|
29
37
|
|
|
30
38
|
|
|
@@ -33,6 +41,7 @@ class DataChainDir:
|
|
|
33
41
|
CACHE = "cache"
|
|
34
42
|
TMP = "tmp"
|
|
35
43
|
DB = "db"
|
|
44
|
+
CONFIG = "config"
|
|
36
45
|
ENV_VAR = "DATACHAIN_DIR"
|
|
37
46
|
ENV_VAR_DATACHAIN_ROOT = "DATACHAIN_ROOT_DIR"
|
|
38
47
|
|
|
@@ -42,6 +51,7 @@ class DataChainDir:
|
|
|
42
51
|
cache: Optional[str] = None,
|
|
43
52
|
tmp: Optional[str] = None,
|
|
44
53
|
db: Optional[str] = None,
|
|
54
|
+
config: Optional[str] = None,
|
|
45
55
|
) -> None:
|
|
46
56
|
self.root = osp.abspath(root) if root is not None else self.default_root()
|
|
47
57
|
self.cache = (
|
|
@@ -51,12 +61,24 @@ class DataChainDir:
|
|
|
51
61
|
osp.abspath(tmp) if tmp is not None else osp.join(self.root, self.TMP)
|
|
52
62
|
)
|
|
53
63
|
self.db = osp.abspath(db) if db is not None else osp.join(self.root, self.DB)
|
|
64
|
+
self.config = (
|
|
65
|
+
osp.abspath(config)
|
|
66
|
+
if config is not None
|
|
67
|
+
else osp.join(self.root, self.CONFIG)
|
|
68
|
+
)
|
|
69
|
+
self.config = (
|
|
70
|
+
osp.abspath(config)
|
|
71
|
+
if config is not None
|
|
72
|
+
else osp.join(self.root, self.CONFIG)
|
|
73
|
+
)
|
|
54
74
|
|
|
55
75
|
def init(self):
    """Create the directories this layout needs, if they are missing.

    `root`, `cache` and `tmp` are created as directories. For `db` and
    `config` only the parent directory is created. Calling the method
    again is harmless because existing directories are accepted.
    """
    directories = (
        self.root,
        self.cache,
        self.tmp,
        osp.dirname(self.db),
        # Created once; the original issued this call twice in a row.
        osp.dirname(self.config),
    )
    for directory in directories:
        os.makedirs(directory, exist_ok=True)
|
|
60
82
|
|
|
61
83
|
@classmethod
|
|
62
84
|
def default_root(cls) -> str:
|
|
@@ -82,6 +104,18 @@ class DataChainDir:
|
|
|
82
104
|
return instance
|
|
83
105
|
|
|
84
106
|
|
|
107
|
+
def system_config_dir():
    """Return the system-wide config directory.

    A non-empty value of the environment variable named by
    `ENV_DATACHAIN_SYSTEM_CONFIG_DIR` wins; otherwise the result of
    `platformdirs.site_config_dir` for this application is returned.
    """
    override = os.getenv(ENV_DATACHAIN_SYSTEM_CONFIG_DIR)
    if override:
        return override
    return platformdirs.site_config_dir(APPNAME, APPAUTHOR)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def global_config_dir():
    """Return the per-user config directory.

    A non-empty value of the environment variable named by
    `ENV_DATACHAIN_GLOBAL_CONFIG_DIR` wins; otherwise the result of
    `platformdirs.user_config_dir` for this application is returned.
    """
    override = os.getenv(ENV_DATACHAIN_GLOBAL_CONFIG_DIR)
    if override:
        return override
    return platformdirs.user_config_dir(APPNAME, APPAUTHOR)
|
|
117
|
+
|
|
118
|
+
|
|
85
119
|
def human_time_to_int(time: str) -> Optional[int]:
|
|
86
120
|
if not time:
|
|
87
121
|
return None
|
|
@@ -421,3 +455,27 @@ def env2bool(var, undefined=False):
|
|
|
421
455
|
if var is None:
|
|
422
456
|
return undefined
|
|
423
457
|
return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def nested_dict_path_set(
    data: dict[str, Any], path: Sequence[str], value: Any
) -> dict[str, Any]:
    """Store `value` in `data` under the nested keys listed in `path`.

    Every key except the last one names a sub-dict to descend into; missing
    sub-dicts are created on the way. `data` is modified in place and also
    returned.
    """
    node = data
    for key in path[:-1]:
        # setdefault creates the intermediate dict only when it is absent.
        node = node.setdefault(key, {})
    node[path[-1]] = value
    return data
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def row_to_nested_dict(
    headers: Iterable[Sequence[str]], row: Iterable[Any]
) -> dict[str, Any]:
    """Build a nested dict from a row and the key paths of its columns.

    `headers` and `row` are walked in lockstep: each header is a key path
    and the matching row item is stored under it with
    `nested_dict_path_set`. Pairing stops at the shorter of the two inputs.
    """
    nested: dict[str, Any] = {}
    for key_path, cell in zip(headers, row):
        nested_dict_path_set(nested, key_path, cell)
    return nested
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -19,7 +19,7 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: pyyaml
|
|
20
20
|
Requires-Dist: tomlkit
|
|
21
21
|
Requires-Dist: tqdm
|
|
22
|
-
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: numpy <3,>=1
|
|
23
23
|
Requires-Dist: pandas >=2.0.0
|
|
24
24
|
Requires-Dist: pyarrow
|
|
25
25
|
Requires-Dist: typing-extensions
|
|
@@ -38,15 +38,16 @@ Requires-Dist: orjson >=3.10.5
|
|
|
38
38
|
Requires-Dist: pydantic <3,>=2
|
|
39
39
|
Requires-Dist: jmespath >=1.0
|
|
40
40
|
Requires-Dist: datamodel-code-generator >=0.25
|
|
41
|
-
Requires-Dist: Pillow <
|
|
41
|
+
Requires-Dist: Pillow <12,>=10.0.0
|
|
42
42
|
Requires-Dist: msgpack <2,>=1.0.4
|
|
43
43
|
Requires-Dist: psutil
|
|
44
44
|
Requires-Dist: huggingface-hub
|
|
45
45
|
Requires-Dist: iterative-telemetry >=0.0.9
|
|
46
|
-
Requires-Dist:
|
|
46
|
+
Requires-Dist: platformdirs
|
|
47
|
+
Requires-Dist: dvc-studio-client <1,>=0.21
|
|
47
48
|
Provides-Extra: dev
|
|
48
49
|
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
49
|
-
Requires-Dist: mypy ==1.
|
|
50
|
+
Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
|
|
50
51
|
Requires-Dist: types-python-dateutil ; extra == 'dev'
|
|
51
52
|
Requires-Dist: types-pytz ; extra == 'dev'
|
|
52
53
|
Requires-Dist: types-PyYAML ; extra == 'dev'
|
|
@@ -63,7 +64,7 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
|
|
|
63
64
|
Requires-Dist: numpy <2,>=1 ; extra == 'examples'
|
|
64
65
|
Requires-Dist: defusedxml ; extra == 'examples'
|
|
65
66
|
Requires-Dist: accelerate ; extra == 'examples'
|
|
66
|
-
Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
|
|
67
|
+
Requires-Dist: unstructured[embed-huggingface,pdf] <0.16.0 ; extra == 'examples'
|
|
67
68
|
Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
|
|
68
69
|
Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
|
|
69
70
|
Requires-Dist: onnx ==1.16.1 ; extra == 'examples'
|