datachain 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as potentially problematic.
- datachain/cache.py +4 -2
- datachain/catalog/catalog.py +100 -54
- datachain/catalog/datasource.py +4 -6
- datachain/cli/__init__.py +311 -0
- datachain/cli/commands/__init__.py +29 -0
- datachain/cli/commands/datasets.py +129 -0
- datachain/cli/commands/du.py +14 -0
- datachain/cli/commands/index.py +12 -0
- datachain/cli/commands/ls.py +169 -0
- datachain/cli/commands/misc.py +28 -0
- datachain/cli/commands/query.py +53 -0
- datachain/cli/commands/show.py +38 -0
- datachain/cli/parser/__init__.py +547 -0
- datachain/cli/parser/job.py +120 -0
- datachain/cli/parser/studio.py +126 -0
- datachain/cli/parser/utils.py +63 -0
- datachain/{cli_utils.py → cli/utils.py} +27 -1
- datachain/client/azure.py +21 -1
- datachain/client/fsspec.py +45 -13
- datachain/client/gcs.py +10 -2
- datachain/client/local.py +4 -4
- datachain/client/s3.py +10 -0
- datachain/dataset.py +1 -0
- datachain/func/__init__.py +2 -2
- datachain/func/conditional.py +52 -0
- datachain/func/func.py +5 -1
- datachain/lib/arrow.py +4 -0
- datachain/lib/dc.py +18 -3
- datachain/lib/file.py +1 -1
- datachain/lib/listing.py +36 -3
- datachain/lib/signal_schema.py +89 -27
- datachain/listing.py +1 -5
- datachain/node.py +27 -1
- datachain/progress.py +2 -2
- datachain/query/session.py +1 -1
- datachain/studio.py +58 -38
- datachain/utils.py +1 -1
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA +6 -6
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/RECORD +43 -31
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/WHEEL +1 -1
- datachain/cli.py +0 -1475
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/LICENSE +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/top_level.txt +0 -0
datachain/lib/listing.py
CHANGED

```diff
@@ -39,6 +39,15 @@ def list_bucket(uri: str, cache, client_config=None) -> Callable:
     return list_func
 
 
+def get_file_info(uri: str, cache, client_config=None) -> File:
+    """
+    Wrapper to return File object by its URI
+    """
+    client = Client.get_client(uri, cache, **(client_config or {}))  # type: ignore[arg-type]
+    _, path = Client.parse_url(uri)
+    return client.get_file_info(path)
+
+
 def ls(
     dc: D,
     path: str,
@@ -76,7 +85,25 @@ def ls(
     return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
 
 
-def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
+def _isfile(client: "Client", path: str) -> bool:
+    """
+    Returns True if uri points to a file
+    """
+    try:
+        info = client.fs.info(path)
+        name = info.get("name")
+        # case for special simulated directories on some clouds
+        # e.g. Google creates a zero byte file with the same name as the
+        # directory with a trailing slash at the end
+        if not name or name.endswith("/"):
+            return False
+
+        return info["type"] == "file"
+    except:  # noqa: E722
+        return False
+
+
+def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
@@ -85,7 +112,9 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     storage_uri, path = Client.parse_url(uri)
     telemetry.log_param("client", client.PREFIX)
 
-    if
+    if not uri.endswith("/") and _isfile(client, uri):
+        return None, f'{storage_uri}/{path.lstrip("/")}', path
+    if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
         storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
@@ -113,7 +142,7 @@ def listing_uri_from_name(dataset_name: str) -> str:
 
 def get_listing(
     uri: str, session: "Session", update: bool = False
-) -> tuple[str, str, str, bool]:
+) -> tuple[Optional[str], str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
@@ -131,6 +160,10 @@ def get_listing(
     ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
     listing = None
 
+    # if we don't want to use cached dataset (e.g. for a single file listing)
+    if not ds_name:
+        return None, list_uri, list_path, False
+
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
     ]
```
datachain/lib/signal_schema.py
CHANGED

```diff
@@ -13,13 +13,14 @@ from typing import (  # noqa: UP035
     Final,
     List,
     Literal,
+    Mapping,
     Optional,
     Union,
     get_args,
     get_origin,
 )
 
-from pydantic import BaseModel, create_model
+from pydantic import BaseModel, Field, create_model
 from sqlalchemy import ColumnElement
 from typing_extensions import Literal as LiteralEx
 
@@ -85,8 +86,31 @@ class SignalResolvingTypeError(SignalResolvingError):
     )
 
 
+class CustomType(BaseModel):
+    schema_version: int = Field(ge=1, le=2, strict=True)
+    name: str
+    fields: dict[str, str]
+    bases: list[tuple[str, str, Optional[str]]]
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any], type_name: str) -> "CustomType":
+        version = data.get("schema_version", 1)
+
+        if version == 1:
+            data = {
+                "schema_version": 1,
+                "name": type_name,
+                "fields": data,
+                "bases": [],
+            }
+
+        return cls(**data)
+
+
 def create_feature_model(
-    name: str,
+    name: str,
+    fields: Mapping[str, Union[type, None, tuple[type, Any]]],
+    base: Optional[type] = None,
 ) -> type[BaseModel]:
     """
     This gets or returns a dynamic feature model for use in restoring a model
@@ -98,7 +122,7 @@ def create_feature_model(
     name = name.replace("@", "_")
     return create_model(
         name,
-        __base__=DataModel,  # type: ignore[call-overload]
+        __base__=base or DataModel,  # type: ignore[call-overload]
         # These are tuples for each field of: annotation, default (if any)
         **{
             field_name: anno if isinstance(anno, tuple) else (anno, None)
@@ -156,7 +180,7 @@ class SignalSchema:
         return SignalSchema(signals)
 
     @staticmethod
-    def
+    def _serialize_custom_model(
         version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
     ) -> str:
         """This serializes any custom type information to the provided custom_types
@@ -165,12 +189,23 @@ class SignalSchema:
             # This type is already stored in custom_types.
             return version_name
         fields = {}
+
         for field_name, info in fr.model_fields.items():
             field_type = info.annotation
             # All fields should be typed.
             assert field_type
             fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)
-
+
+        bases: list[tuple[str, str, Optional[str]]] = []
+        for type_ in fr.__mro__:
+            model_store_name = (
+                ModelStore.get_name(type_) if issubclass(type_, DataModel) else None
+            )
+            bases.append((type_.__name__, type_.__module__, model_store_name))
+
+        ct = CustomType(schema_version=2, name=version_name, fields=fields, bases=bases)
+        custom_types[version_name] = ct.model_dump()
+
         return version_name
 
     @staticmethod
@@ -184,15 +219,12 @@ class SignalSchema:
             if st is None or not ModelStore.is_pydantic(st):
                 continue
             # Register and save feature types.
-            ModelStore.register(st)
             st_version_name = ModelStore.get_name(st)
             if st is fr:
                 # If the main type is Pydantic, then use the ModelStore version name.
                 type_name = st_version_name
             # Save this type to custom_types.
-            SignalSchema.
-                st_version_name, st, custom_types
-            )
+            SignalSchema._serialize_custom_model(st_version_name, st, custom_types)
         return type_name
 
     def serialize(self) -> dict[str, Any]:
@@ -215,7 +247,7 @@ class SignalSchema:
                 depth += 1
             elif c == "]":
                 if depth == 0:
-                    raise
+                    raise ValueError(
                         "Extra closing square bracket when parsing subtype list"
                     )
                 depth -= 1
@@ -223,16 +255,51 @@ class SignalSchema:
             subtypes.append(type_name[start:i].strip())
             start = i + 1
         if depth > 0:
-            raise
+            raise ValueError("Unclosed square bracket when parsing subtype list")
         subtypes.append(type_name[start:].strip())
         return subtypes
 
     @staticmethod
-    def
+    def _deserialize_custom_type(
+        type_name: str, custom_types: dict[str, Any]
+    ) -> Optional[type]:
+        """Given a type name like MyType@v1 gets a type from ModelStore or recreates
+        it based on the information from the custom types dict that includes fields and
+        bases."""
+        model_name, version = ModelStore.parse_name_version(type_name)
+        fr = ModelStore.get(model_name, version)
+        if fr:
+            return fr
+
+        if type_name in custom_types:
+            ct = CustomType.deserialize(custom_types[type_name], type_name)
+
+            fields = {
+                field_name: SignalSchema._resolve_type(field_type_str, custom_types)
+                for field_name, field_type_str in ct.fields.items()
+            }
+
+            base_model = None
+            for base in ct.bases:
+                _, _, model_store_name = base
+                if model_store_name:
+                    model_name, version = ModelStore.parse_name_version(
+                        model_store_name
+                    )
+                    base_model = ModelStore.get(model_name, version)
+                if base_model:
+                    break
+
+            return create_feature_model(type_name, fields, base=base_model)
+
+        return None
+
+    @staticmethod
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:
         """Convert a string-based type back into a python type."""
         type_name = type_name.strip()
         if not type_name:
-            raise
+            raise ValueError("Type cannot be empty")
         if type_name == "NoneType":
             return None
 
@@ -240,14 +307,14 @@ class SignalSchema:
         subtypes: Optional[tuple[Optional[type], ...]] = None
         if bracket_idx > -1:
             if bracket_idx == 0:
-                raise
+                raise ValueError("Type cannot start with '['")
             close_bracket_idx = type_name.rfind("]")
             if close_bracket_idx == -1:
-                raise
+                raise ValueError("Unclosed square bracket when parsing type")
             if close_bracket_idx < bracket_idx:
-                raise
+                raise ValueError("Square brackets are out of order when parsing type")
             if close_bracket_idx == bracket_idx + 1:
-                raise
+                raise ValueError("Empty square brackets when parsing type")
             subtype_names = SignalSchema._split_subtypes(
                 type_name[bracket_idx + 1 : close_bracket_idx]
            )
@@ -267,18 +334,10 @@ class SignalSchema:
             return fr[subtypes]  # type: ignore[index]
         return fr  # type: ignore[return-value]
 
-
-        fr = ModelStore.get(model_name, version)
+        fr = SignalSchema._deserialize_custom_type(type_name, custom_types)
         if fr:
             return fr
 
-        if type_name in custom_types:
-            fields = custom_types[type_name]
-            fields = {
-                field_name: SignalSchema._resolve_type(field_type_str, custom_types)
-                for field_name, field_type_str in fields.items()
-            }
-            return create_feature_model(type_name, fields)
         # This can occur if a third-party or custom type is used, which is not available
         # when deserializing.
         warnings.warn(
@@ -317,7 +376,7 @@ class SignalSchema:
                     stacklevel=2,
                 )
                 continue
-            except
+            except ValueError as err:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"
                 ) from err
@@ -662,6 +721,9 @@ class SignalSchema:
                 stacklevel=2,
             )
             return "Any"
+        if ModelStore.is_pydantic(type_):
+            ModelStore.register(type_)
+            return ModelStore.get_name(type_)
         return type_.__name__
 
     @staticmethod
```
datachain/listing.py
CHANGED

```diff
@@ -157,11 +157,7 @@ class Listing:
 
         counter = 0
         for node in all_nodes:
-            dst = os.path.join(output, *node.path)
-            dst_dir = os.path.dirname(dst)
-            os.makedirs(dst_dir, exist_ok=True)
-            file = node.n.to_file(self.client.uri)
-            self.client.instantiate_object(file, dst, progress_bar, force)
+            node.instantiate(self.client, output, progress_bar, force=force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
```
datachain/node.py
CHANGED

```diff
@@ -1,3 +1,4 @@
+import os
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -10,6 +11,8 @@ from datachain.utils import TIME_ZERO, time_to_str
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from datachain.client import Client
+
 
 class DirType:
     FILE = 0
@@ -114,7 +117,21 @@ class Node:
     )
 
     @classmethod
-    def
+    def from_file(cls, f: File) -> "Self":
+        return cls(
+            source=StorageURI(f.source),
+            path=f.path,
+            etag=f.etag,
+            is_latest=f.is_latest,
+            size=f.size,
+            last_modified=f.last_modified,
+            version=f.version,
+            location=str(f.location) if f.location else None,
+            dir_type=DirType.FILE,
+        )
+
+    @classmethod
+    def from_row(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
         def _dval(field_name: str):
             return d.get(f"{file_prefix}__{field_name}")
 
@@ -174,6 +191,15 @@ class NodeWithPath:
             path += "/"
         return path
 
+    def instantiate(
+        self, client: "Client", output: str, progress_bar, *, force: bool = False
+    ):
+        dst = os.path.join(output, *self.path)
+        dst_dir = os.path.dirname(dst)
+        os.makedirs(dst_dir, exist_ok=True)
+        file = self.n.to_file(client.uri)
+        client.instantiate_object(file, dst, progress_bar, force)
+
 
 TIME_FMT = "%Y-%m-%d %H:%M"
```
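`NodeWithPath.instantiate` absorbs the path handling that the caller in datachain/listing.py previously did inline: join the node's path components under the output directory, create intermediate directories, then hand the file to the client. The path logic in isolation (made-up values):

```python
import os

output = "/tmp/out"
node_path = ["images", "cats", "1.jpg"]  # stands in for NodeWithPath.path
dst = os.path.join(output, *node_path)
os.makedirs(os.path.dirname(dst), exist_ok=True)  # creates /tmp/out/images/cats
print(dst)  # /tmp/out/images/cats/1.jpg
```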
datachain/progress.py
CHANGED

```diff
@@ -61,7 +61,7 @@ class Tqdm(tqdm):
         disable : If (default: None) or False,
             will be determined by logging level.
             May be overridden to `True` due to non-TTY status.
-            Skip override by specifying env var `
+            Skip override by specifying env var `DATACHAIN_IGNORE_ISATTY`.
         kwargs : anything accepted by `tqdm.tqdm()`
         """
         kwargs = kwargs.copy()
@@ -77,7 +77,7 @@ class Tqdm(tqdm):
         # auto-disable based on TTY
         if (
             not disable
-            and not env2bool("
+            and not env2bool("DATACHAIN_IGNORE_ISATTY")
             and hasattr(file, "isatty")
         ):
             disable = not file.isatty()
```
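Both the docstring and the TTY check now reference `DATACHAIN_IGNORE_ISATTY`. To keep progress bars when output is not a TTY (for example, piped CI logs), set the variable before the bar is created; a minimal sketch, assuming `env2bool` treats `"1"` as truthy:

```python
import os

# Opt out of the non-TTY auto-disable for datachain progress bars.
os.environ["DATACHAIN_IGNORE_ISATTY"] = "1"
```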
datachain/query/session.py
CHANGED

```diff
@@ -55,7 +55,7 @@ class Session:
         client_config: Optional[dict] = None,
         in_memory: bool = False,
     ):
-        if re.match(r"^[0-9a-zA-Z]
+        if re.match(r"^[0-9a-zA-Z]*$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
             )
```
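The new pattern is anchored and allows only alphanumerics; note that `*` (as opposed to `+`) also accepts an empty name. A quick standalone check of what it matches:

```python
import re

pattern = r"^[0-9a-zA-Z]*$"
for name in ["mySession1", "", "bad-name", "with space"]:
    print(repr(name), bool(re.match(pattern, name)))
# 'mySession1' and '' match; hyphens and spaces are rejected.
```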
datachain/studio.py
CHANGED

```diff
@@ -20,21 +20,7 @@ POST_LOGIN_MESSAGE = (
 )
 
 
-def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
-    if args.cmd == "login":
-        return login(args)
-    if args.cmd == "logout":
-        return logout()
-    if args.cmd == "token":
-        return token()
-    if args.cmd == "datasets":
-        rows = [
-            {"Name": name, "Version": version}
-            for name, version in list_datasets(args.team)
-        ]
-        print(tabulate(rows, headers="keys"))
-        return 0
-
+def process_jobs_args(args: "Namespace"):
     if args.cmd == "run":
         return create_job(
             args.query_file,
@@ -50,6 +36,25 @@ def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
 
     if args.cmd == "cancel":
         return cancel_job(args.job_id, args.team)
+    if args.cmd == "logs":
+        return show_job_logs(args.job_id, args.team)
+    raise DataChainError(f"Unknown command '{args.cmd}'.")
+
+
+def process_studio_cli_args(args: "Namespace"):
+    if args.cmd == "login":
+        return login(args)
+    if args.cmd == "logout":
+        return logout()
+    if args.cmd == "token":
+        return token()
+    if args.cmd == "dataset":
+        rows = [
+            {"Name": name, "Version": version}
+            for name, version in list_datasets(args.team)
+        ]
+        print(tabulate(rows, headers="keys"))
+        return 0
 
     if args.cmd == "team":
         return set_team(args)
@@ -187,6 +192,32 @@ def save_config(hostname, token):
     return config.config_file()
 
 
+def show_logs_from_client(client, job_id):
+    # Sync usage
+    async def _run():
+        async for message in client.tail_job_logs(job_id):
+            if "logs" in message:
+                for log in message["logs"]:
+                    print(log["message"], end="")
+            elif "job" in message:
+                print(f"\n>>>> Job is now in {message['job']['status']} status.")
+
+    asyncio.run(_run())
+
+    response = client.dataset_job_versions(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    response_data = response.data
+    if response_data:
+        dataset_versions = response_data.get("dataset_versions", [])
+        print("\n\n>>>> Dataset versions created during the job:")
+        for version in dataset_versions:
+            print(f"  - {version.get('dataset_name')}@v{version.get('version')}")
+    else:
+        print("No dataset versions created during the job.")
+
+
 def create_job(
     query_file: str,
     team_name: Optional[str],
@@ -236,29 +267,7 @@ def create_job(
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
     print("=" * 40)
 
-
-    async def _run():
-        async for message in client.tail_job_logs(job_id):
-            if "logs" in message:
-                for log in message["logs"]:
-                    print(log["message"], end="")
-            elif "job" in message:
-                print(f"\n>>>> Job is now in {message['job']['status']} status.")
-
-    asyncio.run(_run())
-
-    response = client.dataset_job_versions(job_id)
-    if not response.ok:
-        raise_remote_error(response.message)
-
-    response_data = response.data
-    if response_data:
-        dataset_versions = response_data.get("dataset_versions", [])
-        print("\n\n>>>> Dataset versions created during the job:")
-        for version in dataset_versions:
-            print(f"  - {version.get('dataset_name')}@v{version.get('version')}")
-    else:
-        print("No dataset versions created during the job.")
+    show_logs_from_client(client, job_id)
 
 
 def upload_files(client: StudioClient, files: list[str]) -> list[str]:
@@ -293,3 +302,14 @@ def cancel_job(job_id: str, team_name: Optional[str]):
         raise_remote_error(response.message)
 
     print(f"Job {job_id} canceled")
+
+
+def show_job_logs(job_id: str, team_name: Optional[str]):
+    token = Config().read().get("studio", {}).get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    client = StudioClient(team=team_name)
+    show_logs_from_client(client, job_id)
```
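The extracted `show_logs_from_client` drains an async generator of log messages with `asyncio.run`, then prints the dataset versions the job produced. A self-contained sketch of the tailing pattern, with a fake client standing in for `StudioClient` (which requires a Studio token):

```python
import asyncio

class FakeClient:
    """Stands in for StudioClient; tail_job_logs yields message dicts."""

    async def tail_job_logs(self, job_id):
        yield {"logs": [{"message": f"job {job_id}: step 1\n"}]}
        yield {"job": {"status": "COMPLETED"}}

async def _run(client, job_id):
    async for message in client.tail_job_logs(job_id):
        if "logs" in message:
            for log in message["logs"]:
                print(log["message"], end="")
        elif "job" in message:
            print(f">>>> Job is now in {message['job']['status']} status.")

asyncio.run(_run(FakeClient(), "job-123"))
```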
datachain/utils.py
CHANGED

```diff
@@ -30,7 +30,7 @@ APPNAME = "datachain"
 APPAUTHOR = "iterative"
 ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
 ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
-STUDIO_URL = "https://studio.
+STUDIO_URL = "https://studio.datachain.ai"
 
 
 T = TypeVar("T", bound="DataChainDir")
```
{datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.2
+Version: 0.8.4
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -50,7 +50,7 @@ Requires-Dist: websockets
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
-Requires-Dist: mkdocs-material
+Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
 Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.
+Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.14.
+Requires-Dist: mypy==1.14.1; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -95,11 +95,11 @@ Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
-Requires-Dist: unstructured[pdf]; extra == "examples"
+Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.
+Requires-Dist: ultralytics==8.3.55; extra == "examples"
 
 ================
 |logo| DataChain
```