datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/catalog.py +75 -83
- datachain/catalog/loader.py +3 -3
- datachain/checkpoint.py +1 -2
- datachain/cli/__init__.py +2 -4
- datachain/cli/commands/datasets.py +13 -13
- datachain/cli/commands/ls.py +4 -4
- datachain/cli/commands/query.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +1 -2
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +11 -21
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +4 -4
- datachain/client/local.py +4 -4
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +5 -5
- datachain/data_storage/metastore.py +107 -107
- datachain/data_storage/schema.py +18 -24
- datachain/data_storage/sqlite.py +21 -28
- datachain/data_storage/warehouse.py +13 -13
- datachain/dataset.py +64 -70
- datachain/delta.py +21 -18
- datachain/diff/__init__.py +13 -13
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +45 -42
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +54 -81
- datachain/job.py +8 -8
- datachain/lib/arrow.py +17 -14
- datachain/lib/audio.py +6 -6
- datachain/lib/clip.py +5 -4
- datachain/lib/convert/python_to_sql.py +4 -22
- datachain/lib/convert/values_to_tuples.py +4 -9
- datachain/lib/data_model.py +20 -19
- datachain/lib/dataset_info.py +6 -6
- datachain/lib/dc/csv.py +10 -10
- datachain/lib/dc/database.py +28 -29
- datachain/lib/dc/datachain.py +98 -97
- datachain/lib/dc/datasets.py +22 -22
- datachain/lib/dc/hf.py +4 -4
- datachain/lib/dc/json.py +9 -10
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +5 -5
- datachain/lib/dc/storage.py +12 -12
- datachain/lib/dc/storage_pattern.py +2 -2
- datachain/lib/dc/utils.py +11 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +32 -28
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +1 -2
- datachain/lib/model_store.py +3 -3
- datachain/lib/namespaces.py +4 -6
- datachain/lib/projects.py +5 -9
- datachain/lib/pytorch.py +10 -10
- datachain/lib/settings.py +23 -23
- datachain/lib/signal_schema.py +52 -44
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +25 -17
- datachain/lib/udf_signature.py +11 -11
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +30 -35
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +4 -4
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +1 -7
- datachain/project.py +4 -4
- datachain/query/batch.py +7 -8
- datachain/query/dataset.py +80 -87
- datachain/query/dispatch.py +7 -7
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/schema.py +7 -6
- datachain/query/session.py +7 -7
- datachain/query/udf.py +8 -7
- datachain/query/utils.py +3 -5
- datachain/remote/studio.py +33 -39
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +6 -9
- datachain/studio.py +30 -30
- datachain/toolkit/split.py +1 -2
- datachain/utils.py +21 -21
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
- datachain-0.35.0.dist-info/RECORD +173 -0
- datachain-0.34.6.dist-info/RECORD +0 -173
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
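
Nearly every hunk below is the same mechanical typing cleanup: `Optional[X]` annotations are rewritten as PEP 604 unions (`X | None`) and the now-unused `Optional` imports are dropped; for type checkers the two spellings are identical. A minimal sketch of the pattern (the function names and bodies here are placeholders, not datachain code; the real signatures appear in the hunks below):

from typing import Optional

def count_projects_old(namespace_id: Optional[int] = None) -> int:  # 0.34.6 spelling
    return 0  # placeholder body

def count_projects_new(namespace_id: int | None = None) -> int:  # 0.35.0 spelling, same type
    return 0  # placeholder body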
datachain/data_storage/metastore.py
CHANGED

@@ -7,7 +7,7 @@ from collections.abc import Iterator
 from datetime import datetime, timezone
 from functools import cached_property, reduce
 from itertools import groupby
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from uuid import uuid4
 
 from sqlalchemy import (
@@ -83,7 +83,7 @@ class AbstractMetastore(ABC, Serializable):
 
     def __init__(
         self,
-        uri: Optional[StorageURI] = None,
+        uri: StorageURI | None = None,
     ):
         self.uri = uri or StorageURI("")
 
@@ -97,7 +97,7 @@ class AbstractMetastore(ABC, Serializable):
     @abstractmethod
     def clone(
         self,
-        uri: Optional[StorageURI] = None,
+        uri: StorageURI | None = None,
         use_new_connection: bool = False,
     ) -> "AbstractMetastore":
         """Clones AbstractMetastore implementation for some Storage input.
@@ -137,8 +137,8 @@ class AbstractMetastore(ABC, Serializable):
     def create_namespace(
         self,
         name: str,
-        description: Optional[str] = None,
-        uuid: Optional[str] = None,
+        description: str | None = None,
+        uuid: str | None = None,
         ignore_if_exists: bool = True,
         validate: bool = True,
         **kwargs,
@@ -185,8 +185,8 @@ class AbstractMetastore(ABC, Serializable):
         self,
         namespace_name: str,
         name: str,
-        description: Optional[str] = None,
-        uuid: Optional[str] = None,
+        description: str | None = None,
+        uuid: str | None = None,
         ignore_if_exists: bool = True,
         validate: bool = True,
         **kwargs,
@@ -219,7 +219,7 @@ class AbstractMetastore(ABC, Serializable):
         """Gets a single project by id"""
 
     @abstractmethod
-    def count_projects(self, namespace_id: Optional[int] = None) -> int:
+    def count_projects(self, namespace_id: int | None = None) -> int:
         """Counts projects in some namespace or in general."""
 
     @abstractmethod
@@ -227,7 +227,7 @@ class AbstractMetastore(ABC, Serializable):
         """Removes a single project by id"""
 
     @abstractmethod
-    def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
+    def list_projects(self, namespace_id: int | None, conn=None) -> list[Project]:
         """Gets list of projects in some namespace or in general (in all namespaces)"""
 
     #
@@ -237,15 +237,15 @@ class AbstractMetastore(ABC, Serializable):
     def create_dataset(
         self,
         name: str,
-        project_id: Optional[int] = None,
+        project_id: int | None = None,
         status: int = DatasetStatus.CREATED,
-        sources: Optional[list[str]] = None,
-        feature_schema: Optional[dict] = None,
+        sources: list[str] | None = None,
+        feature_schema: dict | None = None,
         query_script: str = "",
-        schema: Optional[dict[str, Any]] = None,
+        schema: dict[str, Any] | None = None,
         ignore_if_exists: bool = False,
-        description: Optional[str] = None,
-        attrs: Optional[list[str]] = None,
+        description: str | None = None,
+        attrs: list[str] | None = None,
     ) -> DatasetRecord:
         """Creates new dataset."""
 
@@ -256,20 +256,20 @@ class AbstractMetastore(ABC, Serializable):
         version: str,
         status: int,
         sources: str = "",
-        feature_schema: Optional[dict] = None,
+        feature_schema: dict | None = None,
         query_script: str = "",
         error_message: str = "",
         error_stack: str = "",
         script_output: str = "",
-        created_at: Optional[datetime] = None,
-        finished_at: Optional[datetime] = None,
-        schema: Optional[dict[str, Any]] = None,
+        created_at: datetime | None = None,
+        finished_at: datetime | None = None,
+        schema: dict[str, Any] | None = None,
         ignore_if_exists: bool = False,
-        num_objects: Optional[int] = None,
-        size: Optional[int] = None,
-        preview: Optional[list[dict]] = None,
-        job_id: Optional[str] = None,
-        uuid: Optional[str] = None,
+        num_objects: int | None = None,
+        size: int | None = None,
+        preview: list[dict] | None = None,
+        job_id: str | None = None,
+        uuid: str | None = None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
 
@@ -298,17 +298,17 @@ class AbstractMetastore(ABC, Serializable):
 
     @abstractmethod
     def list_datasets(
-        self, project_id: Optional[int] = None
+        self, project_id: int | None = None
     ) -> Iterator[DatasetListRecord]:
         """Lists all datasets in some project or in all projects."""
 
     @abstractmethod
-    def count_datasets(self, project_id: Optional[int] = None) -> int:
+    def count_datasets(self, project_id: int | None = None) -> int:
         """Counts datasets in some project or in all projects."""
 
     @abstractmethod
     def list_datasets_by_prefix(
-        self, prefix: str, project_id: Optional[int] = None
+        self, prefix: str, project_id: int | None = None
     ) -> Iterator["DatasetListRecord"]:
         """
         Lists all datasets which names start with prefix in some project or in all
@@ -319,8 +319,8 @@ class AbstractMetastore(ABC, Serializable):
     def get_dataset(
         self,
         name: str,  # normal, not full dataset name
-        namespace_name: Optional[str] = None,
-        project_name: Optional[str] = None,
+        namespace_name: str | None = None,
+        project_name: str | None = None,
         conn=None,
     ) -> DatasetRecord:
         """Gets a single dataset by name."""
@@ -330,7 +330,7 @@ class AbstractMetastore(ABC, Serializable):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[str] = None,
+        version: str | None = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -355,20 +355,20 @@ class AbstractMetastore(ABC, Serializable):
         self,
         source_dataset: DatasetRecord,
         source_dataset_version: str,
-        new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[str] = None,
+        new_source_dataset: DatasetRecord | None = None,
+        new_source_dataset_version: str | None = None,
     ) -> None:
         """Updates dataset dependency source."""
 
     @abstractmethod
     def get_direct_dataset_dependencies(
         self, dataset: DatasetRecord, version: str
-    ) -> list[Optional[DatasetDependency]]:
+    ) -> list[DatasetDependency | None]:
         """Gets direct dataset dependencies."""
 
     @abstractmethod
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[str] = None
+        self, dataset: DatasetRecord, version: str | None = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well.
@@ -376,7 +376,7 @@ class AbstractMetastore(ABC, Serializable):
 
     @abstractmethod
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[str] = None
+        self, dataset: DatasetRecord, version: str | None = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -398,9 +398,9 @@ class AbstractMetastore(ABC, Serializable):
         query_type: JobQueryType = JobQueryType.PYTHON,
         status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
-        python_version: Optional[str] = None,
-        params: Optional[dict[str, str]] = None,
-        parent_job_id: Optional[str] = None,
+        python_version: str | None = None,
+        params: dict[str, str] | None = None,
+        parent_job_id: str | None = None,
     ) -> str:
         """
         Creates a new job.
@@ -408,19 +408,19 @@ class AbstractMetastore(ABC, Serializable):
         """
 
     @abstractmethod
-    def get_job(self, job_id: str) -> Optional[Job]:
+    def get_job(self, job_id: str) -> Job | None:
         """Returns the job with the given ID."""
 
     @abstractmethod
     def update_job(
         self,
         job_id: str,
-        status: Optional[JobStatus] = None,
-        error_message: Optional[str] = None,
-        error_stack: Optional[str] = None,
-        finished_at: Optional[datetime] = None,
-        metrics: Optional[dict[str, Any]] = None,
-    ) -> Optional[Job]:
+        status: JobStatus | None = None,
+        error_message: str | None = None,
+        error_stack: str | None = None,
+        finished_at: datetime | None = None,
+        metrics: dict[str, Any] | None = None,
+    ) -> Job | None:
         """Updates job fields."""
 
     @abstractmethod
@@ -428,13 +428,13 @@ class AbstractMetastore(ABC, Serializable):
         self,
         job_id: str,
         status: JobStatus,
-        error_message: Optional[str] = None,
-        error_stack: Optional[str] = None,
+        error_message: str | None = None,
+        error_stack: str | None = None,
     ) -> None:
         """Set the status of the given job."""
 
     @abstractmethod
-    def get_job_status(self, job_id: str) -> Optional[JobStatus]:
+    def get_job_status(self, job_id: str) -> JobStatus | None:
         """Returns the status of the given job."""
 
     #
@@ -442,11 +442,11 @@ class AbstractMetastore(ABC, Serializable):
     #
 
     @abstractmethod
-    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator[Checkpoint]:
         """Returns all checkpoints related to some job"""
 
     @abstractmethod
-    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional["Checkpoint"]:
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Checkpoint | None:
         """Get last created checkpoint for some job."""
 
     @abstractmethod
@@ -455,7 +455,7 @@ class AbstractMetastore(ABC, Serializable):
 
     def find_checkpoint(
         self, job_id: str, _hash: str, partial: bool = False, conn=None
-    ) -> Optional["Checkpoint"]:
+    ) -> Checkpoint | None:
         """
         Tries to find checkpoint for a job with specific hash and optionally partial
         """
@@ -466,7 +466,7 @@ class AbstractMetastore(ABC, Serializable):
         job_id: str,
         _hash: str,
         partial: bool = False,
-        conn: Optional[Any] = None,
+        conn: Any | None = None,
     ) -> Checkpoint:
         """Creates new checkpoint"""
 
@@ -489,7 +489,7 @@ class AbstractDBMetastore(AbstractMetastore):
 
     db: "DatabaseEngine"
 
-    def __init__(self, uri: Optional[StorageURI] = None):
+    def __init__(self, uri: StorageURI | None = None):
         uri = uri or StorageURI("")
         super().__init__(uri)
 
@@ -781,8 +781,8 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_namespace(
         self,
         name: str,
-        description: Optional[str] = None,
-        uuid: Optional[str] = None,
+        description: str | None = None,
+        uuid: str | None = None,
         ignore_if_exists: bool = True,
         validate: bool = True,
         **kwargs,
@@ -846,8 +846,8 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         namespace_name: str,
         name: str,
-        description: Optional[str] = None,
-        uuid: Optional[str] = None,
+        description: str | None = None,
+        uuid: str | None = None,
         ignore_if_exists: bool = True,
         validate: bool = True,
         **kwargs,
@@ -925,7 +925,7 @@ class AbstractDBMetastore(AbstractMetastore):
             raise ProjectNotFoundError(f"Project with id {project_id} not found.")
         return self.project_class.parse(*rows[0])
 
-    def count_projects(self, namespace_id: Optional[int] = None) -> int:
+    def count_projects(self, namespace_id: int | None = None) -> int:
         p = self._projects
 
         query = self._projects_base_query()
@@ -949,7 +949,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self.db.execute(self._projects_delete().where(p.c.id == project_id))
 
     def list_projects(
-        self, namespace_id: Optional[int] = None, conn=None
+        self, namespace_id: int | None = None, conn=None
     ) -> list[Project]:
         """
         Gets a list of projects inside some namespace, or in all namespaces
@@ -972,15 +972,15 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_dataset(
         self,
         name: str,
-        project_id: Optional[int] = None,
+        project_id: int | None = None,
         status: int = DatasetStatus.CREATED,
-        sources: Optional[list[str]] = None,
-        feature_schema: Optional[dict] = None,
+        sources: list[str] | None = None,
+        feature_schema: dict | None = None,
         query_script: str = "",
-        schema: Optional[dict[str, Any]] = None,
+        schema: dict[str, Any] | None = None,
         ignore_if_exists: bool = False,
-        description: Optional[str] = None,
-        attrs: Optional[list[str]] = None,
+        description: str | None = None,
+        attrs: list[str] | None = None,
         **kwargs,  # TODO registered = True / False
     ) -> DatasetRecord:
         """Creates new dataset."""
@@ -1020,20 +1020,20 @@ class AbstractDBMetastore(AbstractMetastore):
         version: str,
         status: int,
         sources: str = "",
-        feature_schema: Optional[dict] = None,
+        feature_schema: dict | None = None,
         query_script: str = "",
         error_message: str = "",
         error_stack: str = "",
         script_output: str = "",
-        created_at: Optional[datetime] = None,
-        finished_at: Optional[datetime] = None,
-        schema: Optional[dict[str, Any]] = None,
+        created_at: datetime | None = None,
+        finished_at: datetime | None = None,
+        schema: dict[str, Any] | None = None,
         ignore_if_exists: bool = False,
-        num_objects: Optional[int] = None,
-        size: Optional[int] = None,
-        preview: Optional[list[dict]] = None,
-        job_id: Optional[str] = None,
-        uuid: Optional[str] = None,
+        num_objects: int | None = None,
+        size: int | None = None,
+        preview: list[dict] | None = None,
+        job_id: str | None = None,
+        uuid: str | None = None,
         conn=None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
@@ -1205,13 +1205,13 @@ class AbstractDBMetastore(AbstractMetastore):
                 f"Dataset {dataset.name} does not have version {version}"
             )
 
-    def _parse_dataset(self, rows) -> Optional[DatasetRecord]:
+    def _parse_dataset(self, rows) -> DatasetRecord | None:
        versions = [self.dataset_class.parse(*r) for r in rows]
        if not versions:
            return None
        return reduce(lambda ds, version: ds.merge_versions(version), versions)
 
-    def _parse_list_dataset(self, rows) -> Optional[DatasetListRecord]:
+    def _parse_list_dataset(self, rows) -> DatasetListRecord | None:
         versions = [self.dataset_list_class.parse(*r) for r in rows]
         if not versions:
             return None
@@ -1274,7 +1274,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
 
     def list_datasets(
-        self, project_id: Optional[int] = None
+        self, project_id: int | None = None
     ) -> Iterator["DatasetListRecord"]:
         d = self._datasets
         query = self._base_list_datasets_query().order_by(
@@ -1284,7 +1284,7 @@ class AbstractDBMetastore(AbstractMetastore):
             query = query.where(d.c.project_id == project_id)
         yield from self._parse_dataset_list(self.db.execute(query))
 
-    def count_datasets(self, project_id: Optional[int] = None) -> int:
+    def count_datasets(self, project_id: int | None = None) -> int:
         d = self._datasets
         query = self._datasets_select()
         if project_id:
@@ -1295,7 +1295,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return next(self.db.execute(query))[0]
 
     def list_datasets_by_prefix(
-        self, prefix: str, project_id: Optional[int] = None, conn=None
+        self, prefix: str, project_id: int | None = None, conn=None
     ) -> Iterator["DatasetListRecord"]:
         d = self._datasets
         query = self._base_list_datasets_query()
@@ -1307,8 +1307,8 @@ class AbstractDBMetastore(AbstractMetastore):
     def get_dataset(
         self,
         name: str,  # normal, not full dataset name
-        namespace_name: Optional[str] = None,
-        project_name: Optional[str] = None,
+        namespace_name: str | None = None,
+        project_name: str | None = None,
         conn=None,
     ) -> DatasetRecord:
         """
@@ -1369,7 +1369,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[str] = None,
+        version: str | None = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -1423,8 +1423,8 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         source_dataset: DatasetRecord,
         source_dataset_version: str,
-        new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[str] = None,
+        new_source_dataset: DatasetRecord | None = None,
+        new_source_dataset_version: str | None = None,
     ) -> None:
         dd = self._datasets_dependencies
 
@@ -1456,7 +1456,7 @@ class AbstractDBMetastore(AbstractMetastore):
 
     def get_direct_dataset_dependencies(
         self, dataset: DatasetRecord, version: str
-    ) -> list[Optional[DatasetDependency]]:
+    ) -> list[DatasetDependency | None]:
         n = self._namespaces
         p = self._projects
         d = self._datasets
@@ -1484,7 +1484,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
 
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[str] = None
+        self, dataset: DatasetRecord, version: str | None = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well
@@ -1503,7 +1503,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self.db.execute(q)
 
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[str] = None
+        self, dataset: DatasetRecord, version: str | None = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -1600,10 +1600,10 @@ class AbstractDBMetastore(AbstractMetastore):
         query_type: JobQueryType = JobQueryType.PYTHON,
         status: JobStatus = JobStatus.CREATED,
         workers: int = 1,
-        python_version: Optional[str] = None,
-        params: Optional[dict[str, str]] = None,
-        parent_job_id: Optional[str] = None,
-        conn: Optional[Any] = None,
+        python_version: str | None = None,
+        params: dict[str, str] | None = None,
+        parent_job_id: str | None = None,
+        conn: Any = None,
     ) -> str:
         """
         Creates a new job.
@@ -1630,7 +1630,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return job_id
 
-    def get_job(self, job_id: str, conn=None) -> Optional[Job]:
+    def get_job(self, job_id: str, conn=None) -> Job | None:
         """Returns the job with the given ID."""
         query = self._jobs_select(self._jobs).where(self._jobs.c.id == job_id)
         results = list(self.db.execute(query, conn=conn))
@@ -1641,13 +1641,13 @@ class AbstractDBMetastore(AbstractMetastore):
     def update_job(
         self,
         job_id: str,
-        status: Optional[JobStatus] = None,
-        error_message: Optional[str] = None,
-        error_stack: Optional[str] = None,
-        finished_at: Optional[datetime] = None,
-        metrics: Optional[dict[str, Any]] = None,
-        conn: Optional[Any] = None,
-    ) -> Optional[Job]:
+        status: JobStatus | None = None,
+        error_message: str | None = None,
+        error_stack: str | None = None,
+        finished_at: datetime | None = None,
+        metrics: dict[str, Any] | None = None,
+        conn: Any | None = None,
+    ) -> Job | None:
         """Updates job fields."""
         values: dict = {}
         if status is not None:
@@ -1674,9 +1674,9 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         job_id: str,
         status: JobStatus,
-        error_message: Optional[str] = None,
-        error_stack: Optional[str] = None,
-        conn: Optional[Any] = None,
+        error_message: str | None = None,
+        error_stack: str | None = None,
+        conn: Any | None = None,
    ) -> None:
         """Set the status of the given job."""
         values: dict = {"status": status}
@@ -1694,8 +1694,8 @@ class AbstractDBMetastore(AbstractMetastore):
     def get_job_status(
         self,
         job_id: str,
-        conn: Optional[Any] = None,
-    ) -> Optional[JobStatus]:
+        conn: Any | None = None,
+    ) -> JobStatus | None:
         """Returns the status of the given job."""
         results = list(
             self.db.execute(
@@ -1761,7 +1761,7 @@ class AbstractDBMetastore(AbstractMetastore):
         job_id: str,
         _hash: str,
         partial: bool = False,
-        conn: Optional[Any] = None,
+        conn: Any | None = None,
     ) -> Checkpoint:
         """
         Creates a new job query step.
@@ -1797,7 +1797,7 @@ class AbstractDBMetastore(AbstractMetastore):
 
     def find_checkpoint(
         self, job_id: str, _hash: str, partial: bool = False, conn=None
-    ) -> Optional["Checkpoint"]:
+    ) -> Checkpoint | None:
         """
         Tries to find checkpoint for a job with specific hash and optionally partial
         """
@@ -1810,7 +1810,7 @@ class AbstractDBMetastore(AbstractMetastore):
             return None
         return self.checkpoint_class.parse(*rows[0])
 
-    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional["Checkpoint"]:
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Checkpoint | None:
         query = (
             self._checkpoints_query()
             .where(self._checkpoints.c.job_id == job_id)
datachain/data_storage/schema.py
CHANGED

@@ -1,12 +1,6 @@
 import inspect
 from collections.abc import Iterable, Iterator, Sequence
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Generic,
-    Optional,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
 
 import sqlalchemy as sa
 from sqlalchemy.sql import func as f
@@ -96,11 +90,11 @@ class DirExpansion:
     def __init__(self, column: str):
         self.column = column
 
-    def col_name(self, name: str, column: Optional[str] = None) -> str:
+    def col_name(self, name: str, column: str | None = None) -> str:
         column = column or self.column
         return col_name(name, column)
 
-    def c(self, query, name: str, column: Optional[str] = None) -> str:
+    def c(self, query, name: str, column: str | None = None) -> str:
         return getattr(query.c, self.col_name(name, column=column))
 
     def base_select(self, q):
@@ -161,7 +155,7 @@ class DataTable:
         self,
         name: str,
         engine: "DatabaseEngine",
-        column_types: Optional[dict[str, SQLType]] = None,
+        column_types: dict[str, SQLType] | None = None,
         column: str = "file",
     ):
         self.name: str = name
@@ -172,12 +166,12 @@ class DataTable:
     @staticmethod
     def copy_column(
         column: sa.Column,
-        primary_key: Optional[bool] = None,
-        index: Optional[bool] = None,
-        nullable: Optional[bool] = None,
-        default: Optional[Any] = None,
-        server_default: Optional[Any] = None,
-        unique: Optional[bool] = None,
+        primary_key: bool | None = None,
+        index: bool | None = None,
+        nullable: bool | None = None,
+        default: Any | None = None,
+        server_default: Any | None = None,
+        unique: bool | None = None,
     ) -> sa.Column:
         """
         Copy a sqlalchemy Column object intended for use as a signal column.
@@ -206,8 +200,8 @@ class DataTable:
     def new_table(
         cls,
         name: str,
-        columns: Sequence["sa.Column"] = (),
-        metadata: Optional["sa.MetaData"] = None,
+        columns: Sequence[sa.Column] = (),
+        metadata: sa.MetaData | None = None,
     ):
         # copy columns, since reusing the same objects from another table
         # may raise an error
@@ -218,7 +212,7 @@ class DataTable:
             metadata = sa.MetaData()
         return sa.Table(name, metadata, *columns)
 
-    def get_table(self) -> "sa.Table":
+    def get_table(self) -> sa.Table:
         table = self.engine.get_table(self.name)
 
         column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
@@ -233,19 +227,19 @@ class DataTable:
     def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
         return self.table.columns
 
-    def col_name(self, name: str, column: Optional[str] = None) -> str:
+    def col_name(self, name: str, column: str | None = None) -> str:
         column = column or self.column
         return col_name(name, column)
 
-    def without_object(self, column_name: str, column: Optional[str] = None) -> str:
+    def without_object(self, column_name: str, column: str | None = None) -> str:
         column = column or self.column
         return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")
 
-    def c(self, name: str, column: Optional[str] = None):
+    def c(self, name: str, column: str | None = None):
         return getattr(self.columns, self.col_name(name, column=column))
 
     @property
-    def table(self) -> "sa.Table":
+    def table(self) -> sa.Table:
         return self.get_table()
 
     def apply_conditions(self, query: "Executable") -> "Executable":
@@ -303,7 +297,7 @@ PARTITION_COLUMN_ID = "partition_id"
 partition_col_names = [PARTITION_COLUMN_ID]
 
 
-def partition_columns() -> Sequence["sa.Column"]:
+def partition_columns() -> Sequence[sa.Column]:
     return [
         sa.Column(PARTITION_COLUMN_ID, sa.Integer),
     ]