datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- datachain/__init__.py +2 -0
- datachain/catalog/catalog.py +62 -228
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +12 -4
- datachain/data_storage/warehouse.py +101 -129
- datachain/lib/convert/sql_to_python.py +8 -12
- datachain/lib/dc.py +275 -80
- datachain/lib/func/__init__.py +32 -0
- datachain/lib/func/aggregate.py +353 -0
- datachain/lib/func/func.py +152 -0
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +17 -8
- datachain/lib/udf.py +3 -3
- datachain/lib/utils.py +5 -0
- datachain/listing.py +22 -48
- datachain/query/__init__.py +1 -2
- datachain/query/batch.py +0 -1
- datachain/query/dataset.py +33 -46
- datachain/query/schema.py +1 -61
- datachain/query/session.py +33 -25
- datachain/remote/studio.py +63 -14
- datachain/sql/functions/__init__.py +1 -1
- datachain/sql/functions/aggregate.py +47 -0
- datachain/sql/functions/array.py +0 -8
- datachain/sql/sqlite/base.py +20 -2
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/cli.py
CHANGED
@@ -15,8 +15,8 @@ import shtab
 from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
+from datachain.studio import process_studio_cli_args
 from datachain.telemetry import telemetry
-from datachain.utils import DataChainDir

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -98,6 +98,134 @@ def add_show_args(parser: ArgumentParser) -> None:
     )


+def add_studio_parser(subparsers, parent_parser) -> None:
+    studio_help = "Commands to authenticate DataChain with Iterative Studio"
+    studio_description = (
+        "Authenticate DataChain with Studio and set the token. "
+        "Once this token has been properly configured,\n"
+        "DataChain will utilize it for seamlessly sharing datasets\n"
+        "and using Studio features from CLI"
+    )
+
+    studio_parser = subparsers.add_parser(
+        "studio",
+        parents=[parent_parser],
+        description=studio_description,
+        help=studio_help,
+    )
+    studio_subparser = studio_parser.add_subparsers(
+        dest="cmd",
+        help="Use `DataChain studio CMD --help` to display command-specific help.",
+        required=True,
+    )
+
+    studio_login_help = "Authenticate DataChain with Studio host"
+    studio_login_description = (
+        "By default, this command authenticates the DataChain with Studio\n"
+        "using default scopes and assigns a random name as the token name."
+    )
+    login_parser = studio_subparser.add_parser(
+        "login",
+        parents=[parent_parser],
+        description=studio_login_description,
+        help=studio_login_help,
+    )
+
+    login_parser.add_argument(
+        "-H",
+        "--hostname",
+        action="store",
+        default=None,
+        help="The hostname of the Studio instance to authenticate with.",
+    )
+    login_parser.add_argument(
+        "-s",
+        "--scopes",
+        action="store",
+        default=None,
+        help="The scopes for the authentication token. ",
+    )
+
+    login_parser.add_argument(
+        "-n",
+        "--name",
+        action="store",
+        default=None,
+        help="The name of the authentication token. It will be used to\n"
+        "identify token shown in Studio profile.",
+    )
+
+    login_parser.add_argument(
+        "--no-open",
+        action="store_true",
+        default=False,
+        help="Use authentication flow based on user code.\n"
+        "You will be presented with user code to enter in browser.\n"
+        "DataChain will also use this if it cannot launch browser on your behalf.",
+    )
+
+    studio_logout_help = "Logout user from Studio"
+    studio_logout_description = "This removes the studio token from your global config."
+
+    studio_subparser.add_parser(
+        "logout",
+        parents=[parent_parser],
+        description=studio_logout_description,
+        help=studio_logout_help,
+    )
+
+    studio_team_help = "Set the default team for DataChain"
+    studio_team_description = (
+        "Set the default team for DataChain to use when interacting with Studio."
+    )
+
+    team_parser = studio_subparser.add_parser(
+        "team",
+        parents=[parent_parser],
+        description=studio_team_description,
+        help=studio_team_help,
+    )
+    team_parser.add_argument(
+        "team_name",
+        action="store",
+        help="The name of the team to set as the default.",
+    )
+    team_parser.add_argument(
+        "--global",
+        action="store_true",
+        default=False,
+        help="Set the team globally for all DataChain projects.",
+    )
+
+    studio_token_help = "View the token datachain uses to contact Studio"  # noqa: S105 # nosec B105
+
+    studio_subparser.add_parser(
+        "token",
+        parents=[parent_parser],
+        description=studio_token_help,
+        help=studio_token_help,
+    )
+
+    studio_ls_dataset_help = "List the available datasets from Studio"
+    studio_ls_dataset_description = (
+        "This command lists all the datasets available in Studio.\n"
+        "It will show the dataset name and the number of versions available."
+    )
+
+    ls_dataset_parser = studio_subparser.add_parser(
+        "datasets",
+        parents=[parent_parser],
+        description=studio_ls_dataset_description,
+        help=studio_ls_dataset_help,
+    )
+    ls_dataset_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
+    )
+
+
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
         __version__ = version("datachain")
@@ -121,12 +249,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="AWS anon (aka awscli's --no-sign-request)",
     )
-    parent_parser.add_argument(
-        "--ttl",
-        type=human_time_type,
-        default=TTL_HUMAN,
-        help="Time-to-live of data source cache. Negative equals forever.",
-    )
     parent_parser.add_argument(
         "-u", "--update", action="count", default=0, help="Update cache"
     )
@@ -226,6 +348,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Use a different filename for the resulting .edatachain file",
     )

+    add_studio_parser(subp, parent_parser)
+
     parse_pull = subp.add_parser(
         "pull",
         parents=[parent_parser],
@@ -638,16 +762,13 @@ def format_ls_entry(entry: str) -> str:


 def ls_remote(
-    url: str,
-    username: str,
-    token: str,
     paths: Iterable[str],
     long: bool = False,
 ):
     from datachain.node import long_line_str
     from datachain.remote.studio import StudioClient

-    client = StudioClient(
+    client = StudioClient()
     first = True
     for path, response in client.ls(paths):
         if not first:
@@ -679,17 +800,14 @@ def ls(
     **kwargs,
 ):
     if config is None:
-        from .config import
+        from .config import Config

-        config =
+        config = Config().get_remote_config(remote=remote)
     remote_type = config["type"]
     if remote_type == "local":
         ls_local(sources, long=long, **kwargs)
     else:
         ls_remote(
-            config["url"],
-            config["username"],
-            config["token"],
             sources,
             long=long,
         )
@@ -887,7 +1005,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
                 edatachain_only=False,
                 no_edatachain_file=True,
                 no_glob=args.no_glob,
-                ttl=args.ttl,
             )
         elif args.command == "clone":
             catalog.clone(
@@ -897,7 +1014,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
                 update=bool(args.update),
                 recursive=bool(args.recursive),
                 no_glob=args.no_glob,
-                ttl=args.ttl,
                 no_cp=args.no_cp,
                 edatachain=args.edatachain,
                 edatachain_file=args.edatachain_file,
@@ -923,7 +1039,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
                 args.sources,
                 long=bool(args.long),
                 remote=args.remote,
-                ttl=args.ttl,
                 update=bool(args.update),
                 client_config=client_config,
             )
@@ -957,7 +1072,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
                 show_bytes=args.bytes,
                 depth=args.depth,
                 si=args.si,
-                ttl=args.ttl,
                 update=bool(args.update),
                 client_config=client_config,
             )
@@ -965,7 +1079,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
             results_found = False
             for result in catalog.find(
                 args.sources,
-                ttl=args.ttl,
                 update=bool(args.update),
                 names=args.name,
                 inames=args.iname,
@@ -983,7 +1096,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
             index(
                 catalog,
                 args.sources,
-                ttl=args.ttl,
                 update=bool(args.update),
             )
         elif args.command == "completion":
@@ -1001,6 +1113,8 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
             clear_cache(catalog)
         elif args.command == "gc":
             garbage_collect(catalog)
+        elif args.command == "studio":
+            process_studio_cli_args(args)
         else:
             print(f"invalid command: {args.command}", file=sys.stderr)
             return 1
datachain/client/fsspec.py
CHANGED
@@ -124,6 +124,9 @@ class Client(ABC):
     def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(source)
+        if os.name == "nt":
+            storage_url = storage_url.removeprefix("/")
+
         return cls.from_name(storage_url, cache, kwargs)

     @classmethod
@@ -171,6 +174,12 @@ class Client(ABC):

     @classmethod
     def split_url(cls, url: str) -> tuple[str, str]:
+        """
+        Splits the URL into two pieces:
+        1. bucket name without protocol (everything up until the first /)
+        2. path which is the rest of URL starting from bucket name
+        e.g s3://my-bucket/animals/dogs -> (my-bucket, animals/dogs)
+        """
         fill_path = url[len(cls.PREFIX) :]
         path_split = fill_path.split("/", 1)
         bucket = path_split[0]
datachain/client/local.py
CHANGED
@@ -29,25 +29,7 @@ class FileClient(Client):

     @classmethod
     def get_uri(cls, name) -> StorageURI:
-        """
-        This returns root of FS as uri, e.g
-        Linux & Mac : file:///
-        Windows: file:///C:/
-        """
-        return StorageURI(Path(name).as_uri())
-
-    @staticmethod
-    def root_dir() -> str:
-        """
-        Returns file system root path.
-        Linux & MacOS: /
-        Windows: C:/
-        """
-        return Path.cwd().anchor.replace(os.sep, posixpath.sep)
-
-    @staticmethod
-    def root_path() -> Path:
-        return Path(FileClient.root_dir())
+        return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')

     @classmethod
     def ls_buckets(cls, **kwargs):
@@ -75,23 +57,20 @@ class FileClient(Client):

     @classmethod
     def split_url(cls, url: str) -> tuple[str, str]:
-        """
-        Splits url into two components:
-        1. root of the FS which will later on become the name of the storage
-        2. path which will later on become partial path
-        Note that URL needs to be have file:/// protocol.
-        Examples:
-            file:///tmp/dir -> / + tmp/dir
-            file:///c:/windows/files -> c:/ + windows/files
-        """
         parsed = urlparse(url)
         if parsed.scheme == "file":
             scheme, rest = url.split(":", 1)
-
+            url = f"{scheme.lower()}:{rest}"
         else:
-
-
-
+            url = cls.path_to_uri(url)
+
+        fill_path = url[len(cls.PREFIX) :]
+        path_split = fill_path.rsplit("/", 1)
+        bucket = path_split[0]
+        if os.name == "nt":
+            bucket = bucket.removeprefix("/")
+        path = path_split[1] if len(path_split) > 1 else ""
+        return bucket, path

     @classmethod
     def from_name(cls, name: str, cache, kwargs) -> "FileClient":
datachain/config.py
CHANGED
@@ -1,62 +1,137 @@
-import os
 from collections.abc import Mapping
-from
+from contextlib import contextmanager
+from enum import Enum
+from typing import Optional, Union

-from tomlkit import load
+from tomlkit import TOMLDocument, dump, load

-
-    from tomlkit import TOMLDocument
+from datachain.utils import DataChainDir, global_config_dir, system_config_dir


-
-
-
-
-
-    except FileNotFoundError:
-        return None
+# Define an enum with value system, global and local
+class ConfigLevel(Enum):
+    SYSTEM = "system"
+    GLOBAL = "global"
+    LOCAL = "local"


-
-
-
-
-
-
+class Config:
+    SYSTEM_LEVELS = (ConfigLevel.SYSTEM, ConfigLevel.GLOBAL)
+    LOCAL_LEVELS = (ConfigLevel.LOCAL,)
+
+    # In the order of precedence
+    LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS
+
+    def __init__(
+        self,
+        level: Optional[ConfigLevel] = None,
+    ):
+        self.level = level
+
+        self.init()
+
+    @classmethod
+    def get_dir(cls, level: Optional[ConfigLevel]) -> str:
+        if level == ConfigLevel.SYSTEM:
+            return system_config_dir()
+        if level == ConfigLevel.GLOBAL:
+            return global_config_dir()
+
+        return str(DataChainDir.find().root)
+
+    def init(self):
+        d = DataChainDir(self.get_dir(self.level))
+        d.init()
+
+    def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
+        config_path = DataChainDir(self.get_dir(level)).config
+
         try:
-
-
+            with open(config_path, encoding="utf-8") as f:
+                return load(f)
+        except FileNotFoundError:
+            return TOMLDocument()
+
+    def load_config_to_level(self) -> TOMLDocument:
+        merged_conf = TOMLDocument()
+
+        for merge_level in self.LEVELS:
+            if merge_level == self.level:
+                break
+            config = self.load_one(merge_level)
+            if config:
+                merge(merged_conf, config)
+
+        return merged_conf
+
+    def read(self) -> TOMLDocument:
+        if self.level is None:
+            return self.load_config_to_level()
+        return self.load_one(self.level)
+
+    @contextmanager
+    def edit(self):
+        config = self.load_one(self.level)
+        yield config
+
+        self.write(config)
+
+    def config_file(self):
+        return DataChainDir(self.get_dir(self.level)).config
+
+    def write(self, config: TOMLDocument):
+        with open(self.config_file(), "w") as f:
+            dump(config, f)
+
+    def get_remote_config(self, remote: str = "") -> Mapping[str, str]:
+        config = self.read()
+
+        if not config:
             return {"type": "local"}
-
-        remote_conf: Mapping[str, str] = config["remote"][remote]  # type: ignore[assignment,index]
-    except KeyError:
-        raise Exception(
-            f"missing config section for default remote: remote.{remote}"
-        ) from None
-    except Exception as exc:
-        raise Exception("invalid config") from exc
-
-    if not isinstance(remote_conf, Mapping):
-        raise TypeError(f"config section remote.{remote} must be a mapping")
-
-    remote_type = remote_conf.get("type")
-    if remote_type not in ("local", "http"):
-        raise Exception(
-            f'config section remote.{remote} must have "type" with one of: '
-            '"local", "http"'
-        )
-
-    if remote_type == "http":
-        for key in ["url", "username", "token"]:
+        if not remote:
             try:
-
+                remote = config["core"]["default-remote"]  # type: ignore[index,assignment]
             except KeyError:
-
-
-
-
-
-
-
-
-
+                return {"type": "local"}
+        try:
+            remote_conf: Mapping[str, str] = config["remote"][remote]  # type: ignore[assignment,index]
+        except KeyError:
+            raise Exception(
+                f"missing config section for default remote: remote.{remote}"
+            ) from None
+        except Exception as exc:
+            raise Exception("invalid config") from exc
+
+        if not isinstance(remote_conf, Mapping):
+            raise TypeError(f"config section remote.{remote} must be a mapping")
+
+        remote_type = remote_conf.get("type")
+        if remote_type not in ("local", "http"):
+            raise Exception(
+                f'config section remote.{remote} must have "type" with one of: '
+                '"local", "http"'
+            )
+
+        if remote_type == "http":
+            for key in ["url", "username", "token"]:
+                try:
+                    remote_conf[key]
+                except KeyError:
+                    raise Exception(
+                        f"config section remote.{remote} of type {remote_type} "
+                        f"must contain key {key}"
+                    ) from None
+        elif remote_type != "local":
+            raise Exception(
+                f"config section remote.{remote} has invalid remote type {remote_type}"
+            )
+        return remote_conf
+
+
+def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
+    """Merges second dict into first recursively"""
+    for key, val in update.items():
+        if isinstance(into.get(key), dict) and isinstance(val, dict):
+            merge(into[key], val)  # type: ignore[arg-type]
+        else:
+            into[key] = val
datachain/data_storage/schema.py
CHANGED
@@ -26,6 +26,13 @@ if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ColumnElement


+DEFAULT_DELIMITER = "__"
+
+
+def col_name(name: str, object_name: str = "file") -> str:
+    return f"{object_name}{DEFAULT_DELIMITER}{name}"
+
+
 def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
     """
     Removes duplicate columns from a list of columns.
@@ -76,64 +83,81 @@ def convert_rows_custom_column_types(


 class DirExpansion:
-
-
+    def __init__(self, object_name: str):
+        self.object_name = object_name
+
+    def col_name(self, name: str, object_name: Optional[str] = None) -> str:
+        object_name = object_name or self.object_name
+        return col_name(name, object_name)
+
+    def c(self, query, name: str, object_name: Optional[str] = None) -> str:
+        return getattr(query.c, self.col_name(name, object_name=object_name))
+
+    def base_select(self, q):
         return sa.select(
-
-            false().label("is_dir"),
-
-
-
-
+            self.c(q, "id", object_name="sys"),
+            false().label(self.col_name("is_dir")),
+            self.c(q, "source"),
+            self.c(q, "path"),
+            self.c(q, "version"),
+            self.c(q, "location"),
         )

-
-    def apply_group_by(q):
+    def apply_group_by(self, q):
         return (
             sa.select(
                 f.min(q.c.sys__id).label("sys__id"),
-
-
-
-
-                f.max(
+                self.c(q, "is_dir"),
+                self.c(q, "source"),
+                self.c(q, "path"),
+                self.c(q, "version"),
+                f.max(self.c(q, "location")).label(self.col_name("location")),
             )
             .select_from(q)
-            .group_by(
-
+            .group_by(
+                self.c(q, "source"),
+                self.c(q, "path"),
+                self.c(q, "is_dir"),
+                self.c(q, "version"),
+            )
+            .order_by(
+                self.c(q, "source"),
+                self.c(q, "path"),
+                self.c(q, "is_dir"),
+                self.c(q, "version"),
+            )
         )

-
-
-
-        parent = path.parent(q.c.path)
+    def query(self, q):
+        q = self.base_select(q).cte(recursive=True)
+        parent = path.parent(self.c(q, "path"))
         q = q.union_all(
             sa.select(
                 sa.literal(-1).label("sys__id"),
-                true().label("is_dir"),
-
-                parent.label("path"),
-                sa.literal("").label("version"),
-                null().label("location"),
+                true().label(self.col_name("is_dir")),
+                self.c(q, "source"),
+                parent.label(self.col_name("path")),
+                sa.literal("").label(self.col_name("version")),
+                null().label(self.col_name("location")),
             ).where(parent != "")
         )
-        return
+        return self.apply_group_by(q)


 class DataTable:
-    dataset_dir_expansion = staticmethod(DirExpansion.query)
-
     def __init__(
         self,
         name: str,
         engine: "Engine",
         metadata: Optional["sa.MetaData"] = None,
         column_types: Optional[dict[str, SQLType]] = None,
+        object_name: str = "file",
     ):
         self.name: str = name
         self.engine = engine
         self.metadata: sa.MetaData = metadata if metadata is not None else sa.MetaData()
         self.column_types: dict[str, SQLType] = column_types or {}
+        self.object_name = object_name

     @staticmethod
     def copy_column(
@@ -204,9 +228,18 @@ class DataTable:
     def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
         return self.table.columns

-
-
-        return
+    def col_name(self, name: str, object_name: Optional[str] = None) -> str:
+        object_name = object_name or self.object_name
+        return col_name(name, object_name)
+
+    def without_object(
+        self, column_name: str, object_name: Optional[str] = None
+    ) -> str:
+        object_name = object_name or self.object_name
+        return column_name.removeprefix(f"{object_name}{DEFAULT_DELIMITER}")
+
+    def c(self, name: str, object_name: Optional[str] = None):
+        return getattr(self.columns, self.col_name(name, object_name=object_name))

     @property
     def table(self) -> "sa.Table":
@@ -246,7 +279,7 @@ class DataTable:
         ]

     def dir_expansion(self):
-        return self.
+        return DirExpansion(self.object_name)


 PARTITION_COLUMN_ID = "partition_id"