datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +213 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +35 -26
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +433 -37
- datachain/data_storage/sqlite.py +140 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +128 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +253 -91
- datachain/lib/dc/datasets.py +103 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +2 -1
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/signal_schema.py +8 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
@@ -3,7 +3,7 @@ import os
 import sqlite3
 from collections.abc import Iterable, Sequence
 from contextlib import contextmanager
-from functools import wraps
+from functools import cached_property, wraps
 from time import sleep
 from typing import (
     TYPE_CHECKING,
@@ -15,7 +15,15 @@ from typing import (
 )

 import sqlalchemy
-from sqlalchemy import
+from sqlalchemy import (
+    Column,
+    Integer,
+    MetaData,
+    Table,
+    UniqueConstraint,
+    exists,
+    select,
+)
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
@@ -30,7 +38,9 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
 from datachain.dataset import DatasetRecord, StorageURI
-from datachain.error import DataChainError
+from datachain.error import DataChainError, OutdatedDatabaseSchemaError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
@@ -60,6 +70,14 @@ datachain.sql.sqlite.setup()
 quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote

+# NOTE! This should be manually increased when we change our DB schema in codebase
+SCHEMA_VERSION = 1
+
+OUTDATED_SCHEMA_ERROR_MESSAGE = (
+    "You have an old version of the database schema. Please refer to the documentation"
+    " for more information."
+)
+

 def _get_in_memory_uri():
     return "file::memory:?cache=shared"
@@ -303,6 +321,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         )
         return bool(next(self.execute(query))[0])

+    @property
+    def table_names(self) -> list[str]:
+        query = "SELECT name FROM sqlite_master WHERE type='table';"
+        return [r[0] for r in self.execute_str(query).fetchall()]
+
     def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
         self.execute(CreateTable(table, if_not_exists=if_not_exists))
@@ -321,6 +344,8 @@ class SQLiteMetastore(AbstractDBMetastore):
     This is currently used for the local cli.
     """

+    META_TABLE = "meta"
+
     db: "SQLiteDatabaseEngine"

     def __init__(
@@ -342,7 +367,11 @@ class SQLiteMetastore(AbstractDBMetastore):

         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)

+        self._init_meta_table()
+        self._init_meta_schema_value()
+        self._check_schema_version()
         self._init_tables()
+        self._init_namespaces_projects()

     def __exit__(self, exc_type, exc_value, traceback) -> None:
         """Close connection upon exit from context manager."""
@@ -383,8 +412,44 @@ class SQLiteMetastore(AbstractDBMetastore):
         (db_class, db_args, db_kwargs) = db_clone_params
         return cls(uri=uri, db=db_class(*db_args, **db_kwargs))

+    @cached_property
+    def _meta(self) -> Table:
+        return Table(self.META_TABLE, self.db.metadata, *self._meta_columns())
+
+    def _meta_select(self, *columns) -> "Select":
+        if not columns:
+            return self._meta.select()
+        return select(*columns)
+
+    def _meta_insert(self) -> "Insert":
+        return sqlite.insert(self._meta)
+
+    def _init_meta_table(self) -> None:
+        """Initializes meta table"""
+        # NOTE! needs to be called before _init_tables()
+        table_names = self.db.table_names
+        if table_names and self.META_TABLE not in table_names:
+            # this will happen on first run
+            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
+
+        self.db.create_table(self._meta, if_not_exists=True)
+        self.default_table_names.append(self._meta.name)
+
+    def _init_meta_schema_value(self) -> None:
+        """Inserts current schema version value if not present in meta table yet"""
+        stmt = (
+            self._meta_insert()
+            .values(id=1, schema_version=SCHEMA_VERSION)
+            .on_conflict_do_nothing(index_elements=["id"])
+        )
+        self.db.execute(stmt)
+
     def _init_tables(self) -> None:
         """Initialize tables."""
+        self.db.create_table(self._namespaces, if_not_exists=True)
+        self.default_table_names.append(self._namespaces.name)
+        self.db.create_table(self._projects, if_not_exists=True)
+        self.default_table_names.append(self._projects.name)
         self.db.create_table(self._datasets, if_not_exists=True)
         self.default_table_names.append(self._datasets.name)
         self.db.create_table(self._datasets_versions, if_not_exists=True)
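For context: `_init_meta_schema_value` relies on SQLite's upsert support so that seeding the schema-version row is idempotent across restarts. A minimal standalone sketch of the same SQLAlchemy idiom (the engine URL and table below are illustrative, not DataChain's own objects):

import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

metadata = sa.MetaData()
meta = sa.Table(
    "meta",
    metadata,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("schema_version", sa.Integer),
)

engine = sa.create_engine("sqlite:///:memory:")
metadata.create_all(engine)

# INSERT ... ON CONFLICT (id) DO NOTHING: reruns are no-ops, so the
# stored schema_version is never overwritten by a later startup.
stmt = (
    sqlite.insert(meta)
    .values(id=1, schema_version=1)
    .on_conflict_do_nothing(index_elements=["id"])
)
with engine.begin() as conn:
    conn.execute(stmt)
    conn.execute(stmt)  # idempotent: still exactly one row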
@@ -394,10 +459,56 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)

+    def _init_namespaces_projects(self) -> None:
+        """
+        Creates local namespace and local project connected to it.
+        In local environment user cannot explicitly create other namespaces and
+        projects and all datasets user creates will be stored in those.
+        When pulling dataset from Studio, then other namespaces and projects will
+        be created implicitly though, to keep the same fully qualified name with
+        Studio dataset.
+        """
+        system_namespace = self.create_namespace(
+            Namespace.system(), "System namespace", validate=False
+        )
+        self.create_project(
+            system_namespace.name, Project.listing(), "Listing project", validate=False
+        )
+
+    def _check_schema_version(self) -> None:
+        """
+        Checks if current DB schema is up to date with latest DB model and schema
+        version. If not, OutdatedDatabaseSchemaError is raised.
+        """
+        schema_version = next(self.db.execute(self._meta_select()))[1]
+        if schema_version < SCHEMA_VERSION:
+            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
+
+    #
+    # Dataset dependencies
+    #
+    @classmethod
+    def _meta_columns(cls) -> list["SchemaItem"]:
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("schema_version", Integer, default=SCHEMA_VERSION),
+        ]
+
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
-        return [*super()._datasets_columns(), UniqueConstraint("name")]
+        return [*super()._datasets_columns(), UniqueConstraint("project_id", "name")]
+
+    @classmethod
+    def _namespaces_columns(cls) -> list["SchemaItem"]:
+        """Datasets table columns."""
+        return [*super()._namespaces_columns(), UniqueConstraint("name")]
+
+    def _namespaces_insert(self) -> "Insert":
+        return sqlite.insert(self._namespaces)
+
+    def _projects_insert(self) -> "Insert":
+        return sqlite.insert(self._projects)

     def _datasets_insert(self) -> "Insert":
         return sqlite.insert(self._datasets)
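Taken together, `_init_meta_table`, `_init_meta_schema_value`, and `_check_schema_version` form a small migration gate that runs every time the local metastore opens. A minimal sketch of that control flow, using the stdlib sqlite3 module instead of DataChain's engine classes (function name and error type here are illustrative):

import sqlite3

SCHEMA_VERSION = 1

def open_metastore(path: str) -> sqlite3.Connection:
    conn = sqlite3.connect(path)
    tables = [
        row[0]
        for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
    ]
    # Tables exist but "meta" is missing: the DB predates schema versioning.
    if tables and "meta" not in tables:
        raise RuntimeError("outdated database schema")
    conn.execute(
        "CREATE TABLE IF NOT EXISTS meta"
        " (id INTEGER PRIMARY KEY, schema_version INTEGER)"
    )
    conn.execute(
        "INSERT OR IGNORE INTO meta (id, schema_version) VALUES (1, ?)",
        (SCHEMA_VERSION,),
    )
    (stored,) = conn.execute(
        "SELECT schema_version FROM meta WHERE id = 1"
    ).fetchone()
    if stored < SCHEMA_VERSION:
        raise RuntimeError("outdated database schema")
    return conn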
@@ -414,6 +525,8 @@ class SQLiteMetastore(AbstractDBMetastore):

     def _dataset_dependencies_select_columns(self) -> list["SchemaItem"]:
         return [
+            self._namespaces.c.name,
+            self._projects.c.name,
             self._datasets_dependencies.c.id,
             self._datasets_dependencies.c.dataset_id,
             self._datasets_dependencies.c.dataset_version_id,
@@ -429,6 +542,26 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)

+    @property
+    def is_studio(self) -> bool:
+        return False
+
+    #
+    # Namespaces
+    #
+
+    @property
+    def default_namespace_name(self):
+        return Namespace.default()
+
+    #
+    # Projects
+    #
+
+    @property
+    def default_project_name(self):
+        return Project.default()
+

 class SQLiteWarehouse(AbstractWarehouse):
     """
@@ -534,16 +667,16 @@ class SQLiteWarehouse(AbstractWarehouse):
     ) -> None:
         dst_empty = False

-        if not self.db.has_table(self.dataset_table_name(src
+        if not self.db.has_table(self.dataset_table_name(src, src_version)):
             # source table doesn't exist, nothing to do
             return

         src_dr = self.dataset_rows(src, src_version).table

-        if not self.db.has_table(self.dataset_table_name(dst
+        if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
             # destination table doesn't exist, create it
             self.create_dataset_rows_table(
-                self.dataset_table_name(dst
+                self.dataset_table_name(dst, dst_version),
                 columns=src_dr.columns,
             )
             dst_empty = True
datachain/data_storage/warehouse.py
CHANGED
@@ -182,7 +182,7 @@ class AbstractWarehouse(ABC, Serializable):
     ):
         version = version or dataset.latest_version

-        table_name = self.dataset_table_name(dataset
+        table_name = self.dataset_table_name(dataset, version)
         return self.schema.dataset_row_cls(
             table_name,
             self.db,
@@ -254,12 +254,24 @@ class AbstractWarehouse(ABC, Serializable):
         name = parsed.path if parsed.scheme == "file" else parsed.netloc
         return parsed.scheme, name

-    def dataset_table_name(self,
+    def dataset_table_name(self, dataset: DatasetRecord, version: str) -> str:
+        return self._construct_dataset_table_name(
+            dataset.project.namespace.name,
+            dataset.project.name,
+            dataset.name,
+            version,
+        )
+
+    def _construct_dataset_table_name(
+        self, namespace: str, project: str, dataset_name: str, version: str
+    ) -> str:
         prefix = self.DATASET_TABLE_PREFIX
         if Client.is_data_source_uri(dataset_name):
             # for datasets that are created for bucket listing we use different prefix
             prefix = self.DATASET_SOURCE_TABLE_PREFIX
-        return
+        return (
+            f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"
+        )

     def temp_table_name(self) -> str:
         return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
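With namespaces and projects in the picture, each dataset version now maps to a row table named `<prefix><namespace>_<project>_<name>_<version>`, with semver dots replaced by underscores. A standalone reproduction of the rule above (the `ds_` prefix is an illustrative stand-in for `DATASET_TABLE_PREFIX`):

def construct_dataset_table_name(
    namespace: str, project: str, dataset_name: str, version: str
) -> str:
    # Dots are not usable in this naming scheme, so "3.0.1" becomes "3_0_1".
    prefix = "ds_"  # illustrative; DataChain takes this from a class constant
    return f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"

assert (
    construct_dataset_table_name("dev", "clothes", "zalando", "3.0.1")
    == "ds_dev_clothes_zalando_3_0_1"
)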
@@ -287,7 +299,7 @@ class AbstractWarehouse(ABC, Serializable):
         if_exists: bool = True,
     ) -> None:
         """Drops a dataset rows table for the given dataset name."""
-        table_name = self.dataset_table_name(dataset
+        table_name = self.dataset_table_name(dataset, version)
         table = sa.Table(table_name, self.db.metadata)
         self.db.drop_table(table, if_exists=if_exists)
@@ -344,13 +356,20 @@ class AbstractWarehouse(ABC, Serializable):

     def rename_dataset_table(
         self,
+        dataset: DatasetRecord,
         old_name: str,
         new_name: str,
         old_version: str,
         new_version: str,
     ) -> None:
-
-
+        namespace = dataset.project.namespace.name
+        project = dataset.project.name
+        old_ds_table_name = self._construct_dataset_table_name(
+            namespace, project, old_name, old_version
+        )
+        new_ds_table_name = self._construct_dataset_table_name(
+            namespace, project, new_name, new_version
+        )

         self.db.rename_table(old_ds_table_name, new_ds_table_name)
@@ -368,7 +387,7 @@ class AbstractWarehouse(ABC, Serializable):
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
-        if not (self.db.has_table(self.dataset_table_name(dataset
+        if not (self.db.has_table(self.dataset_table_name(dataset, version))):
             return None, None

         file_signals = list(
datachain/dataset.py
CHANGED
@@ -13,7 +13,9 @@ from typing import (
 from urllib.parse import urlparse

 from datachain import semver
-from datachain.error import DatasetVersionNotFoundError
+from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

 T = TypeVar("T", bound="DatasetRecord")
@@ -27,6 +29,8 @@ QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"

 DEFAULT_DATASET_VERSION = "1.0.0"
+DATASET_NAME_RESERVED_CHARS = ["."]
+DATASET_NAME_REPLACEMENT_CHAR = "_"


 # StorageURI represents a normalised URI to a valid storage location (full bucket or
@@ -57,20 +61,37 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
     return name, s[1]


-def create_dataset_uri(
+def create_dataset_uri(
+    name: str, namespace: str, project: str, version: Optional[str] = None
+) -> str:
     """
-    Creates a dataset uri based on dataset name and optionally
+    Creates a dataset uri based on namespace, project, dataset name and optionally
+    version.
     Example:
-        Input: zalando, 3.0.1
-        Output: ds//zalando@v3.0.1
+        Input: dev, clothes, zalando, 3.0.1
+        Output: ds//dev.clothes.zalando@v3.0.1
     """
-    uri = f"{DATASET_PREFIX}{name}"
+    uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
     if version:
         uri += f"@v{version}"

     return uri


+def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
+    """Parses dataset name and returns namespace, project and name"""
+    if not name:
+        raise InvalidDatasetNameError("Name must be defined to parse it")
+    split = name.split(".")
+    if len(split) > 3:
+        raise InvalidDatasetNameError(f"Invalid dataset name {name}")
+    name = split[-1]
+    project_name = split[-2] if len(split) > 1 else None
+    namespace_name = split[-3] if len(split) > 2 else None
+
+    return namespace_name, project_name, name
+
+
 class DatasetDependencyType:
     DATASET = "dataset"
     STORAGE = "storage"
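`parse_dataset_name` accepts one-, two-, or three-part dotted names and returns the missing parts as None, which is what lets older single-name call sites keep working. A standalone reproduction of its logic:

from typing import Optional

def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
    split = name.split(".")
    if not name or len(split) > 3:
        raise ValueError(f"Invalid dataset name {name}")
    return (
        split[-3] if len(split) > 2 else None,  # namespace
        split[-2] if len(split) > 1 else None,  # project
        split[-1],                              # dataset name
    )

assert parse_dataset_name("zalando") == (None, None, "zalando")
assert parse_dataset_name("clothes.zalando") == (None, "clothes", "zalando")
assert parse_dataset_name("dev.clothes.zalando") == ("dev", "clothes", "zalando")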
@@ -78,8 +99,12 @@ class DatasetDependencyType:

 @dataclass
 class DatasetDependency:
+    # TODO put `DatasetRecord` instead of name + version which will
+    # simplify codebase in various places
     id: int
     type: str
+    namespace: str
+    project: str
     name: str
     version: str
     created_at: datetime
@@ -100,6 +125,8 @@ class DatasetDependency:
     @classmethod
     def parse(
         cls: builtins.type[DD],
+        namespace_name: str,
+        project_name: str,
         id: int,
         dataset_id: Optional[int],
         dataset_version_id: Optional[int],
@@ -121,6 +148,8 @@ class DatasetDependency:
             if is_listing_dataset(dataset_name)
             else DatasetDependencyType.DATASET
         ),
+        namespace_name,
+        project_name,
         dataset_name,
         (
             dataset_version  # type: ignore[arg-type]
@@ -335,6 +364,7 @@ class DatasetListVersion:
 class DatasetRecord:
     id: int
     name: str
+    project: Project
     description: Optional[str]
     attrs: list[str]
     schema: dict[str, Union[SQLType, type[SQLType]]]
@@ -349,6 +379,9 @@ class DatasetRecord:
     sources: str = ""
     query_script: str = ""

+    def __hash__(self):
+        return hash(f"{self.id}")
+
     @staticmethod
     def parse_schema(
         ct: dict[str, Any],
@@ -358,10 +391,31 @@ class DatasetRecord:
         for c_name, c_type in ct.items()
     }

+    @staticmethod
+    def validate_name(name: str) -> None:
+        """Throws exception if name has reserved characters"""
+        for c in DATASET_NAME_RESERVED_CHARS:
+            if c in name:
+                raise InvalidDatasetNameError(
+                    f"Character {c} is reserved and not allowed in dataset name"
+                )
+
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: Optional[str],
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: Optional[str],
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
+        dataset_project_id: int,
         name: str,
         description: Optional[str],
         attrs: str,
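Because `.` now separates namespace, project, and dataset in a fully qualified name, it is reserved inside a bare dataset name; `validate_name` rejects it. A quick standalone illustration of the check:

RESERVED_CHARS = ["."]  # mirrors DATASET_NAME_RESERVED_CHARS

def validate_name(name: str) -> None:
    for c in RESERVED_CHARS:
        if c in name:
            raise ValueError(
                f"Character {c} is reserved and not allowed in dataset name"
            )

validate_name("zalando")  # fine: a bare dataset name
try:
    validate_name("dev.clothes.zalando")  # dots belong to the qualified form
except ValueError as exc:
    print(exc)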
@@ -400,6 +454,23 @@ class DatasetRecord:
             json.loads(version_schema) if version_schema else {}
         )

+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
+
         dataset_version = DatasetVersion.parse(
             version_id,
             version_uuid,
@@ -422,8 +493,9 @@ class DatasetRecord:
         )

         return cls(
-
+            dataset_id,
             name,
+            project,
             description,
             attrs_lst,
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
@@ -448,6 +520,10 @@ class DatasetRecord:
             for c_name, c_type in self.schema.items()
         }

+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
         return self.get_version(version).schema if version else self.schema
@@ -527,7 +603,10 @@ class DatasetRecord:
         Dataset uri example: ds://dogs@v3.0.1
         """
         identifier = self.identifier(version)
-        return
+        return (
+            f"{DATASET_PREFIX}{self.project.namespace.name}"
+            f".{self.project.name}.{identifier}"
+        )

     @property
     def next_version_major(self) -> str:
@@ -592,15 +671,17 @@ class DatasetRecord:

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
-        return cls(**kwargs, versions=versions)
+        return cls(**kwargs, versions=versions, project=project)


 @dataclass
 class DatasetListRecord:
     id: int
     name: str
+    project: Project
     description: Optional[str]
     attrs: list[str]
     versions: list[DatasetListVersion]
@@ -609,7 +690,18 @@ class DatasetListRecord:
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: Optional[str],
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: Optional[str],
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
         name: str,
         description: Optional[str],
         attrs: str,
@@ -630,6 +722,23 @@ class DatasetListRecord:
     ) -> "DatasetListRecord":
         attrs_lst: list[str] = json.loads(attrs) if attrs else []

+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
+
         dataset_version = DatasetListVersion.parse(
             version_id,
             version_uuid,
@@ -647,14 +756,19 @@ class DatasetListRecord:
         )

         return cls(
-
+            dataset_id,
             name,
+            project,
             description,
             attrs_lst,
             [dataset_version],
             created_at,
         )

+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         """Merge versions from another dataset"""
         if other.id != self.id:
@@ -691,9 +805,11 @@ class DatasetListRecord:

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
         kwargs["versions"] = versions
+        kwargs["project"] = project
         return cls(**kwargs)
datachain/delta.py
CHANGED
@@ -56,11 +56,13 @@ def _get_delta_chain(
     compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get delta chain for processing changes between versions."""
-    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
-    source_dc_latest = datachain.read_dataset(
+    source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name, version=source_ds_latest_version
+    )

     # Calculate diff between source versions
-    return source_dc_latest.
+    return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)


 def _get_retry_chain(
@@ -79,8 +81,10 @@ def _get_retry_chain(
     retry_chain = None

     # Read the latest version of the result dataset for retry logic
-    result_dataset = datachain.read_dataset(name, latest_version)
-    source_dc_latest = datachain.read_dataset(
+    result_dataset = datachain.read_dataset(name, version=latest_version)
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name, version=source_ds_latest_version
+    )

     # Handle error records if delta_retry is a string (column name)
     if isinstance(delta_retry, str):
@@ -232,8 +236,8 @@ def delta_retry_update(
     if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False

-    latest_dataset = datachain.read_dataset(name, latest_version)
-    compared_chain = latest_dataset.
+    latest_dataset = datachain.read_dataset(name, version=latest_version)
+    compared_chain = latest_dataset.diff(
         processing_chain,
         on=right_on or on,
         added=True,
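Note that the delta helpers now pass `version` to `datachain.read_dataset` as a keyword and compute changes via `diff`, whose `on`, `compare`, and `deleted` parameters appear in the hunks above. A usage sketch of the same public calls (the dataset name, versions, and `on="file"` key are illustrative):

import datachain

# Rows added or modified in 2.0.0 relative to 1.0.0, ignoring deletions.
old = datachain.read_dataset("dev.clothes.zalando", version="1.0.0")
new = datachain.read_dataset("dev.clothes.zalando", version="2.0.0")
changed = new.diff(old, on="file", deleted=False)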
datachain/error.py
CHANGED
@@ -2,10 +2,42 @@ class DataChainError(RuntimeError):
     pass


+class InvalidDatasetNameError(RuntimeError):
+    pass
+
+
+class InvalidNamespaceNameError(RuntimeError):
+    pass
+
+
+class InvalidProjectNameError(RuntimeError):
+    pass
+
+
 class NotFoundError(Exception):
     pass


+class NamespaceNotFoundError(NotFoundError):
+    pass
+
+
+class NotAllowedError(Exception):
+    pass
+
+
+class NamespaceCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectNotFoundError(NotFoundError):
+    pass
+
+
 class DatasetNotFoundError(NotFoundError):
     pass
@@ -53,3 +85,7 @@ class ClientError(RuntimeError):

 class TableMissingError(DataChainError):
     pass
+
+
+class OutdatedDatabaseSchemaError(DataChainError):
+    pass