datachain 0.19.2__py3-none-any.whl → 0.20.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +3 -0
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/parser/__init__.py +1 -35
- datachain/cli/parser/job.py +3 -3
- datachain/data_storage/metastore.py +390 -37
- datachain/data_storage/sqlite.py +139 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +125 -12
- datachain/delta.py +9 -5
- datachain/error.py +36 -0
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +86 -7
- datachain/lib/dc/datasets.py +62 -12
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +14 -2
- datachain/lib/listing.py +3 -1
- datachain/lib/namespaces.py +73 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/settings.py +10 -0
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +63 -28
- datachain/studio.py +26 -9
- {datachain-0.19.2.dist-info → datachain-0.20.1.dist-info}/METADATA +2 -2
- {datachain-0.19.2.dist-info → datachain-0.20.1.dist-info}/RECORD +35 -31
- {datachain-0.19.2.dist-info → datachain-0.20.1.dist-info}/WHEEL +0 -0
- {datachain-0.19.2.dist-info → datachain-0.20.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.19.2.dist-info → datachain-0.20.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.19.2.dist-info → datachain-0.20.1.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
|
3
3
|
import sqlite3
|
|
4
4
|
from collections.abc import Iterable, Sequence
|
|
5
5
|
from contextlib import contextmanager
|
|
6
|
-
from functools import wraps
|
|
6
|
+
from functools import cached_property, wraps
|
|
7
7
|
from time import sleep
|
|
8
8
|
from typing import (
|
|
9
9
|
TYPE_CHECKING,
|
|
@@ -15,7 +15,15 @@ from typing import (
|
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
import sqlalchemy
|
|
18
|
-
from sqlalchemy import
|
|
18
|
+
from sqlalchemy import (
|
|
19
|
+
Column,
|
|
20
|
+
Integer,
|
|
21
|
+
MetaData,
|
|
22
|
+
Table,
|
|
23
|
+
UniqueConstraint,
|
|
24
|
+
exists,
|
|
25
|
+
select,
|
|
26
|
+
)
|
|
19
27
|
from sqlalchemy.dialects import sqlite
|
|
20
28
|
from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
|
|
21
29
|
from sqlalchemy.sql import func
|
|
@@ -30,7 +38,9 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
|
|
|
30
38
|
from datachain.data_storage.db_engine import DatabaseEngine
|
|
31
39
|
from datachain.data_storage.schema import DefaultSchema
|
|
32
40
|
from datachain.dataset import DatasetRecord, StorageURI
|
|
33
|
-
from datachain.error import DataChainError
|
|
41
|
+
from datachain.error import DataChainError, OutdatedDatabaseSchemaError
|
|
42
|
+
from datachain.namespace import Namespace
|
|
43
|
+
from datachain.project import Project
|
|
34
44
|
from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
|
|
35
45
|
from datachain.sql.sqlite.base import load_usearch_extension
|
|
36
46
|
from datachain.sql.types import SQLType
|
|
@@ -60,6 +70,14 @@ datachain.sql.sqlite.setup()
|
|
|
60
70
|
quote_schema = sqlite_dialect.identifier_preparer.quote_schema
|
|
61
71
|
quote = sqlite_dialect.identifier_preparer.quote
|
|
62
72
|
|
|
73
|
+
# NOTE! This should be manually increased when we change our DB schema in codebase
|
|
74
|
+
SCHEMA_VERSION = 1
|
|
75
|
+
|
|
76
|
+
OUTDATED_SCHEMA_ERROR_MESSAGE = (
|
|
77
|
+
"You have an old version of the database schema. Please refer to the documentation"
|
|
78
|
+
" for more information."
|
|
79
|
+
)
|
|
80
|
+
|
|
63
81
|
|
|
64
82
|
def _get_in_memory_uri():
|
|
65
83
|
return "file::memory:?cache=shared"
|
|
@@ -303,6 +321,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
303
321
|
)
|
|
304
322
|
return bool(next(self.execute(query))[0])
|
|
305
323
|
|
|
324
|
+
@property
|
|
325
|
+
def table_names(self) -> list[str]:
|
|
326
|
+
query = "SELECT name FROM sqlite_master WHERE type='table';"
|
|
327
|
+
return [r[0] for r in self.execute_str(query).fetchall()]
|
|
328
|
+
|
|
306
329
|
def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
|
|
307
330
|
self.execute(CreateTable(table, if_not_exists=if_not_exists))
|
|
308
331
|
|
|
@@ -321,6 +344,8 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
321
344
|
This is currently used for the local cli.
|
|
322
345
|
"""
|
|
323
346
|
|
|
347
|
+
META_TABLE = "meta"
|
|
348
|
+
|
|
324
349
|
db: "SQLiteDatabaseEngine"
|
|
325
350
|
|
|
326
351
|
def __init__(
|
|
@@ -342,7 +367,11 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
342
367
|
|
|
343
368
|
self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
|
|
344
369
|
|
|
370
|
+
self._init_meta_table()
|
|
371
|
+
self._init_meta_schema_value()
|
|
372
|
+
self._check_schema_version()
|
|
345
373
|
self._init_tables()
|
|
374
|
+
self._init_namespaces_projects()
|
|
346
375
|
|
|
347
376
|
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
|
348
377
|
"""Close connection upon exit from context manager."""
|
|
@@ -383,8 +412,44 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
383
412
|
(db_class, db_args, db_kwargs) = db_clone_params
|
|
384
413
|
return cls(uri=uri, db=db_class(*db_args, **db_kwargs))
|
|
385
414
|
|
|
415
|
+
@cached_property
|
|
416
|
+
def _meta(self) -> Table:
|
|
417
|
+
return Table(self.META_TABLE, self.db.metadata, *self._meta_columns())
|
|
418
|
+
|
|
419
|
+
def _meta_select(self, *columns) -> "Select":
|
|
420
|
+
if not columns:
|
|
421
|
+
return self._meta.select()
|
|
422
|
+
return select(*columns)
|
|
423
|
+
|
|
424
|
+
def _meta_insert(self) -> "Insert":
|
|
425
|
+
return sqlite.insert(self._meta)
|
|
426
|
+
|
|
427
|
+
def _init_meta_table(self) -> None:
|
|
428
|
+
"""Initializes meta table"""
|
|
429
|
+
# NOTE! needs to be called before _init_tables()
|
|
430
|
+
table_names = self.db.table_names
|
|
431
|
+
if table_names and self.META_TABLE not in table_names:
|
|
432
|
+
# this will happen on first run
|
|
433
|
+
raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
|
|
434
|
+
|
|
435
|
+
self.db.create_table(self._meta, if_not_exists=True)
|
|
436
|
+
self.default_table_names.append(self._meta.name)
|
|
437
|
+
|
|
438
|
+
def _init_meta_schema_value(self) -> None:
|
|
439
|
+
"""Inserts current schema version value if not present in meta table yet"""
|
|
440
|
+
stmt = (
|
|
441
|
+
self._meta_insert()
|
|
442
|
+
.values(id=1, schema_version=SCHEMA_VERSION)
|
|
443
|
+
.on_conflict_do_nothing(index_elements=["id"])
|
|
444
|
+
)
|
|
445
|
+
self.db.execute(stmt)
|
|
446
|
+
|
|
386
447
|
def _init_tables(self) -> None:
|
|
387
448
|
"""Initialize tables."""
|
|
449
|
+
self.db.create_table(self._namespaces, if_not_exists=True)
|
|
450
|
+
self.default_table_names.append(self._namespaces.name)
|
|
451
|
+
self.db.create_table(self._projects, if_not_exists=True)
|
|
452
|
+
self.default_table_names.append(self._projects.name)
|
|
388
453
|
self.db.create_table(self._datasets, if_not_exists=True)
|
|
389
454
|
self.default_table_names.append(self._datasets.name)
|
|
390
455
|
self.db.create_table(self._datasets_versions, if_not_exists=True)
|
|
@@ -394,10 +459,55 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
394
459
|
self.db.create_table(self._jobs, if_not_exists=True)
|
|
395
460
|
self.default_table_names.append(self._jobs.name)
|
|
396
461
|
|
|
462
|
+
def _init_namespaces_projects(self) -> None:
|
|
463
|
+
"""
|
|
464
|
+
Creates local namespace and local project connected to it.
|
|
465
|
+
In local environment user cannot explicitly create other namespaces and
|
|
466
|
+
projects and all datasets user creates will be stored in those.
|
|
467
|
+
When pulling dataset from Studio, then other namespaces and projects will
|
|
468
|
+
be created implicitly though, to keep the same fully qualified name with
|
|
469
|
+
Studio dataset.
|
|
470
|
+
"""
|
|
471
|
+
system_namespace = self.create_namespace(Namespace.system(), "System namespace")
|
|
472
|
+
self.create_project(Project.listing(), system_namespace.name, "Listing project")
|
|
473
|
+
|
|
474
|
+
local_namespace = self.create_namespace(Namespace.default(), "Local namespace")
|
|
475
|
+
self.create_project(Project.default(), local_namespace.name, "Local project")
|
|
476
|
+
|
|
477
|
+
def _check_schema_version(self) -> None:
|
|
478
|
+
"""
|
|
479
|
+
Checks if current DB schema is up to date with latest DB model and schema
|
|
480
|
+
version. If not, OutdatedDatabaseSchemaError is raised.
|
|
481
|
+
"""
|
|
482
|
+
schema_version = next(self.db.execute(self._meta_select()))[1]
|
|
483
|
+
if schema_version < SCHEMA_VERSION:
|
|
484
|
+
raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
|
|
485
|
+
|
|
486
|
+
#
|
|
487
|
+
# Dataset dependencies
|
|
488
|
+
#
|
|
489
|
+
@classmethod
|
|
490
|
+
def _meta_columns(cls) -> list["SchemaItem"]:
|
|
491
|
+
return [
|
|
492
|
+
Column("id", Integer, primary_key=True),
|
|
493
|
+
Column("schema_version", Integer, default=SCHEMA_VERSION),
|
|
494
|
+
]
|
|
495
|
+
|
|
397
496
|
@classmethod
|
|
398
497
|
def _datasets_columns(cls) -> list["SchemaItem"]:
|
|
399
498
|
"""Datasets table columns."""
|
|
400
|
-
return [*super()._datasets_columns(), UniqueConstraint("name")]
|
|
499
|
+
return [*super()._datasets_columns(), UniqueConstraint("project_id", "name")]
|
|
500
|
+
|
|
501
|
+
@classmethod
|
|
502
|
+
def _namespaces_columns(cls) -> list["SchemaItem"]:
|
|
503
|
+
"""Datasets table columns."""
|
|
504
|
+
return [*super()._namespaces_columns(), UniqueConstraint("name")]
|
|
505
|
+
|
|
506
|
+
def _namespaces_insert(self) -> "Insert":
|
|
507
|
+
return sqlite.insert(self._namespaces)
|
|
508
|
+
|
|
509
|
+
def _projects_insert(self) -> "Insert":
|
|
510
|
+
return sqlite.insert(self._projects)
|
|
401
511
|
|
|
402
512
|
def _datasets_insert(self) -> "Insert":
|
|
403
513
|
return sqlite.insert(self._datasets)
|
|
@@ -414,6 +524,8 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
414
524
|
|
|
415
525
|
def _dataset_dependencies_select_columns(self) -> list["SchemaItem"]:
|
|
416
526
|
return [
|
|
527
|
+
self._namespaces.c.name,
|
|
528
|
+
self._projects.c.name,
|
|
417
529
|
self._datasets_dependencies.c.id,
|
|
418
530
|
self._datasets_dependencies.c.dataset_id,
|
|
419
531
|
self._datasets_dependencies.c.dataset_version_id,
|
|
@@ -429,6 +541,26 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
429
541
|
def _jobs_insert(self) -> "Insert":
|
|
430
542
|
return sqlite.insert(self._jobs)
|
|
431
543
|
|
|
544
|
+
@property
|
|
545
|
+
def is_studio(self) -> bool:
|
|
546
|
+
return False
|
|
547
|
+
|
|
548
|
+
#
|
|
549
|
+
# Namespaces
|
|
550
|
+
#
|
|
551
|
+
|
|
552
|
+
@property
|
|
553
|
+
def default_namespace_name(self):
|
|
554
|
+
return Namespace.default()
|
|
555
|
+
|
|
556
|
+
#
|
|
557
|
+
# Projects
|
|
558
|
+
#
|
|
559
|
+
|
|
560
|
+
@property
|
|
561
|
+
def default_project_name(self):
|
|
562
|
+
return Project.default()
|
|
563
|
+
|
|
432
564
|
|
|
433
565
|
class SQLiteWarehouse(AbstractWarehouse):
|
|
434
566
|
"""
|
|
@@ -534,16 +666,16 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
534
666
|
) -> None:
|
|
535
667
|
dst_empty = False
|
|
536
668
|
|
|
537
|
-
if not self.db.has_table(self.dataset_table_name(src
|
|
669
|
+
if not self.db.has_table(self.dataset_table_name(src, src_version)):
|
|
538
670
|
# source table doesn't exist, nothing to do
|
|
539
671
|
return
|
|
540
672
|
|
|
541
673
|
src_dr = self.dataset_rows(src, src_version).table
|
|
542
674
|
|
|
543
|
-
if not self.db.has_table(self.dataset_table_name(dst
|
|
675
|
+
if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
|
|
544
676
|
# destination table doesn't exist, create it
|
|
545
677
|
self.create_dataset_rows_table(
|
|
546
|
-
self.dataset_table_name(dst
|
|
678
|
+
self.dataset_table_name(dst, dst_version),
|
|
547
679
|
columns=src_dr.columns,
|
|
548
680
|
)
|
|
549
681
|
dst_empty = True
|
|
@@ -182,7 +182,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
182
182
|
):
|
|
183
183
|
version = version or dataset.latest_version
|
|
184
184
|
|
|
185
|
-
table_name = self.dataset_table_name(dataset
|
|
185
|
+
table_name = self.dataset_table_name(dataset, version)
|
|
186
186
|
return self.schema.dataset_row_cls(
|
|
187
187
|
table_name,
|
|
188
188
|
self.db,
|
|
@@ -254,12 +254,24 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
254
254
|
name = parsed.path if parsed.scheme == "file" else parsed.netloc
|
|
255
255
|
return parsed.scheme, name
|
|
256
256
|
|
|
257
|
-
def dataset_table_name(self,
|
|
257
|
+
def dataset_table_name(self, dataset: DatasetRecord, version: str) -> str:
|
|
258
|
+
return self._construct_dataset_table_name(
|
|
259
|
+
dataset.project.namespace.name,
|
|
260
|
+
dataset.project.name,
|
|
261
|
+
dataset.name,
|
|
262
|
+
version,
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
def _construct_dataset_table_name(
|
|
266
|
+
self, namespace: str, project: str, dataset_name: str, version: str
|
|
267
|
+
) -> str:
|
|
258
268
|
prefix = self.DATASET_TABLE_PREFIX
|
|
259
269
|
if Client.is_data_source_uri(dataset_name):
|
|
260
270
|
# for datasets that are created for bucket listing we use different prefix
|
|
261
271
|
prefix = self.DATASET_SOURCE_TABLE_PREFIX
|
|
262
|
-
return
|
|
272
|
+
return (
|
|
273
|
+
f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"
|
|
274
|
+
)
|
|
263
275
|
|
|
264
276
|
def temp_table_name(self) -> str:
|
|
265
277
|
return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
|
|
@@ -287,7 +299,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
287
299
|
if_exists: bool = True,
|
|
288
300
|
) -> None:
|
|
289
301
|
"""Drops a dataset rows table for the given dataset name."""
|
|
290
|
-
table_name = self.dataset_table_name(dataset
|
|
302
|
+
table_name = self.dataset_table_name(dataset, version)
|
|
291
303
|
table = sa.Table(table_name, self.db.metadata)
|
|
292
304
|
self.db.drop_table(table, if_exists=if_exists)
|
|
293
305
|
|
|
@@ -344,13 +356,20 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
344
356
|
|
|
345
357
|
def rename_dataset_table(
|
|
346
358
|
self,
|
|
359
|
+
dataset: DatasetRecord,
|
|
347
360
|
old_name: str,
|
|
348
361
|
new_name: str,
|
|
349
362
|
old_version: str,
|
|
350
363
|
new_version: str,
|
|
351
364
|
) -> None:
|
|
352
|
-
|
|
353
|
-
|
|
365
|
+
namespace = dataset.project.namespace.name
|
|
366
|
+
project = dataset.project.name
|
|
367
|
+
old_ds_table_name = self._construct_dataset_table_name(
|
|
368
|
+
namespace, project, old_name, old_version
|
|
369
|
+
)
|
|
370
|
+
new_ds_table_name = self._construct_dataset_table_name(
|
|
371
|
+
namespace, project, new_name, new_version
|
|
372
|
+
)
|
|
354
373
|
|
|
355
374
|
self.db.rename_table(old_ds_table_name, new_ds_table_name)
|
|
356
375
|
|
|
@@ -368,7 +387,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
368
387
|
"""
|
|
369
388
|
Returns tuple with dataset stats: total number of rows and total dataset size.
|
|
370
389
|
"""
|
|
371
|
-
if not (self.db.has_table(self.dataset_table_name(dataset
|
|
390
|
+
if not (self.db.has_table(self.dataset_table_name(dataset, version))):
|
|
372
391
|
return None, None
|
|
373
392
|
|
|
374
393
|
file_signals = list(
|
datachain/dataset.py
CHANGED
|
@@ -13,7 +13,9 @@ from typing import (
|
|
|
13
13
|
from urllib.parse import urlparse
|
|
14
14
|
|
|
15
15
|
from datachain import semver
|
|
16
|
-
from datachain.error import DatasetVersionNotFoundError
|
|
16
|
+
from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
|
|
17
|
+
from datachain.namespace import Namespace
|
|
18
|
+
from datachain.project import Project
|
|
17
19
|
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
|
|
18
20
|
|
|
19
21
|
T = TypeVar("T", bound="DatasetRecord")
|
|
@@ -27,6 +29,8 @@ QUERY_DATASET_PREFIX = "ds_query_"
|
|
|
27
29
|
LISTING_PREFIX = "lst__"
|
|
28
30
|
|
|
29
31
|
DEFAULT_DATASET_VERSION = "1.0.0"
|
|
32
|
+
DATASET_NAME_RESERVED_CHARS = ["."]
|
|
33
|
+
DATASET_NAME_REPLACEMENT_CHAR = "_"
|
|
30
34
|
|
|
31
35
|
|
|
32
36
|
# StorageURI represents a normalised URI to a valid storage location (full bucket or
|
|
@@ -57,20 +61,34 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
|
|
|
57
61
|
return name, s[1]
|
|
58
62
|
|
|
59
63
|
|
|
60
|
-
def create_dataset_uri(
|
|
64
|
+
def create_dataset_uri(
|
|
65
|
+
name: str, namespace: str, project: str, version: Optional[str] = None
|
|
66
|
+
) -> str:
|
|
61
67
|
"""
|
|
62
|
-
Creates a dataset uri based on dataset name and optionally
|
|
68
|
+
Creates a dataset uri based on namespace, project, dataset name and optionally
|
|
69
|
+
version.
|
|
63
70
|
Example:
|
|
64
|
-
Input: zalando, 3.0.1
|
|
65
|
-
Output: ds//zalando@v3.0.1
|
|
71
|
+
Input: dev, clothes, zalando, 3.0.1
|
|
72
|
+
Output: ds//dev.clothes.zalando@v3.0.1
|
|
66
73
|
"""
|
|
67
|
-
uri = f"{DATASET_PREFIX}{name}"
|
|
74
|
+
uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
|
|
68
75
|
if version:
|
|
69
76
|
uri += f"@v{version}"
|
|
70
77
|
|
|
71
78
|
return uri
|
|
72
79
|
|
|
73
80
|
|
|
81
|
+
def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
|
|
82
|
+
"""Parses dataset name and returns namespace, project and name"""
|
|
83
|
+
if not name:
|
|
84
|
+
raise ValueError("Name must be defined to parse it")
|
|
85
|
+
split = name.split(".")
|
|
86
|
+
if len(split) == 3:
|
|
87
|
+
return tuple(split) # type: ignore[return-value]
|
|
88
|
+
|
|
89
|
+
return None, None, name
|
|
90
|
+
|
|
91
|
+
|
|
74
92
|
class DatasetDependencyType:
|
|
75
93
|
DATASET = "dataset"
|
|
76
94
|
STORAGE = "storage"
|
|
@@ -78,8 +96,12 @@ class DatasetDependencyType:
|
|
|
78
96
|
|
|
79
97
|
@dataclass
|
|
80
98
|
class DatasetDependency:
|
|
99
|
+
# TODO put `DatasetRecord` instead of name + version which will
|
|
100
|
+
# simplify codebase in various places
|
|
81
101
|
id: int
|
|
82
102
|
type: str
|
|
103
|
+
namespace: str
|
|
104
|
+
project: str
|
|
83
105
|
name: str
|
|
84
106
|
version: str
|
|
85
107
|
created_at: datetime
|
|
@@ -100,6 +122,8 @@ class DatasetDependency:
|
|
|
100
122
|
@classmethod
|
|
101
123
|
def parse(
|
|
102
124
|
cls: builtins.type[DD],
|
|
125
|
+
namespace_name: str,
|
|
126
|
+
project_name: str,
|
|
103
127
|
id: int,
|
|
104
128
|
dataset_id: Optional[int],
|
|
105
129
|
dataset_version_id: Optional[int],
|
|
@@ -121,6 +145,8 @@ class DatasetDependency:
|
|
|
121
145
|
if is_listing_dataset(dataset_name)
|
|
122
146
|
else DatasetDependencyType.DATASET
|
|
123
147
|
),
|
|
148
|
+
namespace_name,
|
|
149
|
+
project_name,
|
|
124
150
|
dataset_name,
|
|
125
151
|
(
|
|
126
152
|
dataset_version # type: ignore[arg-type]
|
|
@@ -335,6 +361,7 @@ class DatasetListVersion:
|
|
|
335
361
|
class DatasetRecord:
|
|
336
362
|
id: int
|
|
337
363
|
name: str
|
|
364
|
+
project: Project
|
|
338
365
|
description: Optional[str]
|
|
339
366
|
attrs: list[str]
|
|
340
367
|
schema: dict[str, Union[SQLType, type[SQLType]]]
|
|
@@ -349,6 +376,9 @@ class DatasetRecord:
|
|
|
349
376
|
sources: str = ""
|
|
350
377
|
query_script: str = ""
|
|
351
378
|
|
|
379
|
+
def __hash__(self):
|
|
380
|
+
return hash(f"{self.id}")
|
|
381
|
+
|
|
352
382
|
@staticmethod
|
|
353
383
|
def parse_schema(
|
|
354
384
|
ct: dict[str, Any],
|
|
@@ -358,10 +388,31 @@ class DatasetRecord:
|
|
|
358
388
|
for c_name, c_type in ct.items()
|
|
359
389
|
}
|
|
360
390
|
|
|
391
|
+
@staticmethod
|
|
392
|
+
def validate_name(name: str) -> None:
|
|
393
|
+
"""Throws exception if name has reserved characters"""
|
|
394
|
+
for c in DATASET_NAME_RESERVED_CHARS:
|
|
395
|
+
if c in name:
|
|
396
|
+
raise InvalidDatasetNameError(
|
|
397
|
+
f"Character {c} is reserved and not allowed in dataset name"
|
|
398
|
+
)
|
|
399
|
+
|
|
361
400
|
@classmethod
|
|
362
401
|
def parse( # noqa: PLR0913
|
|
363
402
|
cls,
|
|
364
|
-
|
|
403
|
+
namespace_id: int,
|
|
404
|
+
namespace_uuid: str,
|
|
405
|
+
namespace_name: str,
|
|
406
|
+
namespace_description: Optional[str],
|
|
407
|
+
namespace_created_at: datetime,
|
|
408
|
+
project_id: int,
|
|
409
|
+
project_uuid: str,
|
|
410
|
+
project_name: str,
|
|
411
|
+
project_description: Optional[str],
|
|
412
|
+
project_created_at: datetime,
|
|
413
|
+
project_namespace_id: int,
|
|
414
|
+
dataset_id: int,
|
|
415
|
+
dataset_project_id: int,
|
|
365
416
|
name: str,
|
|
366
417
|
description: Optional[str],
|
|
367
418
|
attrs: str,
|
|
@@ -400,6 +451,23 @@ class DatasetRecord:
|
|
|
400
451
|
json.loads(version_schema) if version_schema else {}
|
|
401
452
|
)
|
|
402
453
|
|
|
454
|
+
namespace = Namespace(
|
|
455
|
+
namespace_id,
|
|
456
|
+
namespace_uuid,
|
|
457
|
+
namespace_name,
|
|
458
|
+
namespace_description,
|
|
459
|
+
namespace_created_at,
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
project = Project(
|
|
463
|
+
project_id,
|
|
464
|
+
project_uuid,
|
|
465
|
+
project_name,
|
|
466
|
+
project_description,
|
|
467
|
+
project_created_at,
|
|
468
|
+
namespace,
|
|
469
|
+
)
|
|
470
|
+
|
|
403
471
|
dataset_version = DatasetVersion.parse(
|
|
404
472
|
version_id,
|
|
405
473
|
version_uuid,
|
|
@@ -422,8 +490,9 @@ class DatasetRecord:
|
|
|
422
490
|
)
|
|
423
491
|
|
|
424
492
|
return cls(
|
|
425
|
-
|
|
493
|
+
dataset_id,
|
|
426
494
|
name,
|
|
495
|
+
project,
|
|
427
496
|
description,
|
|
428
497
|
attrs_lst,
|
|
429
498
|
cls.parse_schema(schema_dct), # type: ignore[arg-type]
|
|
@@ -448,6 +517,10 @@ class DatasetRecord:
|
|
|
448
517
|
for c_name, c_type in self.schema.items()
|
|
449
518
|
}
|
|
450
519
|
|
|
520
|
+
@property
|
|
521
|
+
def full_name(self) -> str:
|
|
522
|
+
return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
|
|
523
|
+
|
|
451
524
|
def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
|
|
452
525
|
return self.get_version(version).schema if version else self.schema
|
|
453
526
|
|
|
@@ -527,7 +600,10 @@ class DatasetRecord:
|
|
|
527
600
|
Dataset uri example: ds://dogs@v3.0.1
|
|
528
601
|
"""
|
|
529
602
|
identifier = self.identifier(version)
|
|
530
|
-
return
|
|
603
|
+
return (
|
|
604
|
+
f"{DATASET_PREFIX}{self.project.namespace.name}"
|
|
605
|
+
f".{self.project.name}.{identifier}"
|
|
606
|
+
)
|
|
531
607
|
|
|
532
608
|
@property
|
|
533
609
|
def next_version_major(self) -> str:
|
|
@@ -592,15 +668,17 @@ class DatasetRecord:
|
|
|
592
668
|
|
|
593
669
|
@classmethod
|
|
594
670
|
def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
|
|
671
|
+
project = Project.from_dict(d.pop("project"))
|
|
595
672
|
versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
|
|
596
673
|
kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
|
|
597
|
-
return cls(**kwargs, versions=versions)
|
|
674
|
+
return cls(**kwargs, versions=versions, project=project)
|
|
598
675
|
|
|
599
676
|
|
|
600
677
|
@dataclass
|
|
601
678
|
class DatasetListRecord:
|
|
602
679
|
id: int
|
|
603
680
|
name: str
|
|
681
|
+
project: Project
|
|
604
682
|
description: Optional[str]
|
|
605
683
|
attrs: list[str]
|
|
606
684
|
versions: list[DatasetListVersion]
|
|
@@ -609,7 +687,18 @@ class DatasetListRecord:
|
|
|
609
687
|
@classmethod
|
|
610
688
|
def parse( # noqa: PLR0913
|
|
611
689
|
cls,
|
|
612
|
-
|
|
690
|
+
namespace_id: int,
|
|
691
|
+
namespace_uuid: str,
|
|
692
|
+
namespace_name: str,
|
|
693
|
+
namespace_description: Optional[str],
|
|
694
|
+
namespace_created_at: datetime,
|
|
695
|
+
project_id: int,
|
|
696
|
+
project_uuid: str,
|
|
697
|
+
project_name: str,
|
|
698
|
+
project_description: Optional[str],
|
|
699
|
+
project_created_at: datetime,
|
|
700
|
+
project_namespace_id: int,
|
|
701
|
+
dataset_id: int,
|
|
613
702
|
name: str,
|
|
614
703
|
description: Optional[str],
|
|
615
704
|
attrs: str,
|
|
@@ -630,6 +719,23 @@ class DatasetListRecord:
|
|
|
630
719
|
) -> "DatasetListRecord":
|
|
631
720
|
attrs_lst: list[str] = json.loads(attrs) if attrs else []
|
|
632
721
|
|
|
722
|
+
namespace = Namespace(
|
|
723
|
+
namespace_id,
|
|
724
|
+
namespace_uuid,
|
|
725
|
+
namespace_name,
|
|
726
|
+
namespace_description,
|
|
727
|
+
namespace_created_at,
|
|
728
|
+
)
|
|
729
|
+
|
|
730
|
+
project = Project(
|
|
731
|
+
project_id,
|
|
732
|
+
project_uuid,
|
|
733
|
+
project_name,
|
|
734
|
+
project_description,
|
|
735
|
+
project_created_at,
|
|
736
|
+
namespace,
|
|
737
|
+
)
|
|
738
|
+
|
|
633
739
|
dataset_version = DatasetListVersion.parse(
|
|
634
740
|
version_id,
|
|
635
741
|
version_uuid,
|
|
@@ -647,14 +753,19 @@ class DatasetListRecord:
|
|
|
647
753
|
)
|
|
648
754
|
|
|
649
755
|
return cls(
|
|
650
|
-
|
|
756
|
+
dataset_id,
|
|
651
757
|
name,
|
|
758
|
+
project,
|
|
652
759
|
description,
|
|
653
760
|
attrs_lst,
|
|
654
761
|
[dataset_version],
|
|
655
762
|
created_at,
|
|
656
763
|
)
|
|
657
764
|
|
|
765
|
+
@property
|
|
766
|
+
def full_name(self) -> str:
|
|
767
|
+
return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
|
|
768
|
+
|
|
658
769
|
def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
|
|
659
770
|
"""Merge versions from another dataset"""
|
|
660
771
|
if other.id != self.id:
|
|
@@ -691,9 +802,11 @@ class DatasetListRecord:
|
|
|
691
802
|
|
|
692
803
|
@classmethod
|
|
693
804
|
def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
|
|
805
|
+
project = Project.from_dict(d.pop("project"))
|
|
694
806
|
versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
|
|
695
807
|
kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
|
|
696
808
|
kwargs["versions"] = versions
|
|
809
|
+
kwargs["project"] = project
|
|
697
810
|
return cls(**kwargs)
|
|
698
811
|
|
|
699
812
|
|
datachain/delta.py
CHANGED
|
@@ -56,8 +56,10 @@ def _get_delta_chain(
|
|
|
56
56
|
compare: Optional[Union[str, Sequence[str]]] = None,
|
|
57
57
|
) -> "DataChain":
|
|
58
58
|
"""Get delta chain for processing changes between versions."""
|
|
59
|
-
source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
|
|
60
|
-
source_dc_latest = datachain.read_dataset(
|
|
59
|
+
source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
|
|
60
|
+
source_dc_latest = datachain.read_dataset(
|
|
61
|
+
source_ds_name, version=source_ds_latest_version
|
|
62
|
+
)
|
|
61
63
|
|
|
62
64
|
# Calculate diff between source versions
|
|
63
65
|
return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
|
|
@@ -79,8 +81,10 @@ def _get_retry_chain(
|
|
|
79
81
|
retry_chain = None
|
|
80
82
|
|
|
81
83
|
# Read the latest version of the result dataset for retry logic
|
|
82
|
-
result_dataset = datachain.read_dataset(name, latest_version)
|
|
83
|
-
source_dc_latest = datachain.read_dataset(
|
|
84
|
+
result_dataset = datachain.read_dataset(name, version=latest_version)
|
|
85
|
+
source_dc_latest = datachain.read_dataset(
|
|
86
|
+
source_ds_name, version=source_ds_latest_version
|
|
87
|
+
)
|
|
84
88
|
|
|
85
89
|
# Handle error records if delta_retry is a string (column name)
|
|
86
90
|
if isinstance(delta_retry, str):
|
|
@@ -232,7 +236,7 @@ def delta_retry_update(
|
|
|
232
236
|
if processing_chain is None or (processing_chain and processing_chain.empty):
|
|
233
237
|
return None, None, False
|
|
234
238
|
|
|
235
|
-
latest_dataset = datachain.read_dataset(name, latest_version)
|
|
239
|
+
latest_dataset = datachain.read_dataset(name, version=latest_version)
|
|
236
240
|
compared_chain = latest_dataset.compare(
|
|
237
241
|
processing_chain,
|
|
238
242
|
on=right_on or on,
|
datachain/error.py
CHANGED
|
@@ -2,10 +2,42 @@ class DataChainError(RuntimeError):
|
|
|
2
2
|
pass
|
|
3
3
|
|
|
4
4
|
|
|
5
|
+
class InvalidDatasetNameError(RuntimeError):
|
|
6
|
+
pass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class InvalidNamespaceNameError(RuntimeError):
|
|
10
|
+
pass
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class InvalidProjectNameError(RuntimeError):
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
5
17
|
class NotFoundError(Exception):
|
|
6
18
|
pass
|
|
7
19
|
|
|
8
20
|
|
|
21
|
+
class NamespaceNotFoundError(NotFoundError):
|
|
22
|
+
pass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NotAllowedError(Exception):
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class NamespaceCreateNotAllowedError(NotAllowedError):
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ProjectCreateNotAllowedError(NotAllowedError):
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ProjectNotFoundError(NotFoundError):
|
|
38
|
+
pass
|
|
39
|
+
|
|
40
|
+
|
|
9
41
|
class DatasetNotFoundError(NotFoundError):
|
|
10
42
|
pass
|
|
11
43
|
|
|
@@ -53,3 +85,7 @@ class ClientError(RuntimeError):
|
|
|
53
85
|
|
|
54
86
|
class TableMissingError(DataChainError):
|
|
55
87
|
pass
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class OutdatedDatabaseSchemaError(DataChainError):
|
|
91
|
+
pass
|