datachain 0.20.4__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +7 -0
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -405
- datachain/data_storage/sqlite.py +7 -136
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -260
- datachain/lib/dc/datasets.py +50 -104
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +7 -23
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
@@ -3,7 +3,7 @@ import os
 import sqlite3
 from collections.abc import Iterable, Sequence
 from contextlib import contextmanager
-from functools import cached_property, wraps
+from functools import wraps
 from time import sleep
 from typing import (
     TYPE_CHECKING,
@@ -15,15 +15,7 @@ from typing import (
 )

 import sqlalchemy
-from sqlalchemy import (
-    Column,
-    Integer,
-    MetaData,
-    Table,
-    UniqueConstraint,
-    exists,
-    select,
-)
+from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
@@ -38,9 +30,7 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
 from datachain.dataset import DatasetRecord, StorageURI
-from datachain.error import DataChainError
-from datachain.namespace import Namespace
-from datachain.project import Project
+from datachain.error import DataChainError
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
@@ -70,14 +60,6 @@ datachain.sql.sqlite.setup()
 quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote

-# NOTE! This should be manually increased when we change our DB schema in codebase
-SCHEMA_VERSION = 1
-
-OUTDATED_SCHEMA_ERROR_MESSAGE = (
-    "You have an old version of the database schema. Please refer to the documentation"
-    " for more information."
-)
-

 def _get_in_memory_uri():
     return "file::memory:?cache=shared"
@@ -321,11 +303,6 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         )
         return bool(next(self.execute(query))[0])

-    @property
-    def table_names(self) -> list[str]:
-        query = "SELECT name FROM sqlite_master WHERE type='table';"
-        return [r[0] for r in self.execute_str(query).fetchall()]
-
     def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
         self.execute(CreateTable(table, if_not_exists=if_not_exists))

@@ -344,8 +321,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     This is currently used for the local cli.
     """

-    META_TABLE = "meta"
-
     db: "SQLiteDatabaseEngine"

     def __init__(
@@ -367,11 +342,7 @@ class SQLiteMetastore(AbstractDBMetastore):

         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)

-        self._init_meta_table()
-        self._init_meta_schema_value()
-        self._check_schema_version()
         self._init_tables()
-        self._init_namespaces_projects()

     def __exit__(self, exc_type, exc_value, traceback) -> None:
         """Close connection upon exit from context manager."""
@@ -412,44 +383,8 @@ class SQLiteMetastore(AbstractDBMetastore):
         (db_class, db_args, db_kwargs) = db_clone_params
         return cls(uri=uri, db=db_class(*db_args, **db_kwargs))

-    @cached_property
-    def _meta(self) -> Table:
-        return Table(self.META_TABLE, self.db.metadata, *self._meta_columns())
-
-    def _meta_select(self, *columns) -> "Select":
-        if not columns:
-            return self._meta.select()
-        return select(*columns)
-
-    def _meta_insert(self) -> "Insert":
-        return sqlite.insert(self._meta)
-
-    def _init_meta_table(self) -> None:
-        """Initializes meta table"""
-        # NOTE! needs to be called before _init_tables()
-        table_names = self.db.table_names
-        if table_names and self.META_TABLE not in table_names:
-            # this will happen on first run
-            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
-
-        self.db.create_table(self._meta, if_not_exists=True)
-        self.default_table_names.append(self._meta.name)
-
-    def _init_meta_schema_value(self) -> None:
-        """Inserts current schema version value if not present in meta table yet"""
-        stmt = (
-            self._meta_insert()
-            .values(id=1, schema_version=SCHEMA_VERSION)
-            .on_conflict_do_nothing(index_elements=["id"])
-        )
-        self.db.execute(stmt)
-
     def _init_tables(self) -> None:
         """Initialize tables."""
-        self.db.create_table(self._namespaces, if_not_exists=True)
-        self.default_table_names.append(self._namespaces.name)
-        self.db.create_table(self._projects, if_not_exists=True)
-        self.default_table_names.append(self._projects.name)
         self.db.create_table(self._datasets, if_not_exists=True)
         self.default_table_names.append(self._datasets.name)
         self.db.create_table(self._datasets_versions, if_not_exists=True)
@@ -459,52 +394,10 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)

-    def _init_namespaces_projects(self) -> None:
-        """
-        Creates local namespace and local project connected to it.
-        In local environment user cannot explicitly create other namespaces and
-        projects and all datasets user creates will be stored in those.
-        When pulling dataset from Studio, then other namespaces and projects will
-        be created implicitly though, to keep the same fully qualified name with
-        Studio dataset.
-        """
-        system_namespace = self.create_namespace(Namespace.system(), "System namespace")
-        self.create_project(system_namespace.name, Project.listing(), "Listing project")
-
-    def _check_schema_version(self) -> None:
-        """
-        Checks if current DB schema is up to date with latest DB model and schema
-        version. If not, OutdatedDatabaseSchemaError is raised.
-        """
-        schema_version = next(self.db.execute(self._meta_select()))[1]
-        if schema_version < SCHEMA_VERSION:
-            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
-
-    #
-    # Dataset dependencies
-    #
-    @classmethod
-    def _meta_columns(cls) -> list["SchemaItem"]:
-        return [
-            Column("id", Integer, primary_key=True),
-            Column("schema_version", Integer, default=SCHEMA_VERSION),
-        ]
-
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
-        return [*super()._datasets_columns(), UniqueConstraint("project_id", "name")]
-
-    @classmethod
-    def _namespaces_columns(cls) -> list["SchemaItem"]:
-        """Datasets table columns."""
-        return [*super()._namespaces_columns(), UniqueConstraint("name")]
-
-    def _namespaces_insert(self) -> "Insert":
-        return sqlite.insert(self._namespaces)
-
-    def _projects_insert(self) -> "Insert":
-        return sqlite.insert(self._projects)
+        return [*super()._datasets_columns(), UniqueConstraint("name")]

     def _datasets_insert(self) -> "Insert":
         return sqlite.insert(self._datasets)
@@ -521,8 +414,6 @@ class SQLiteMetastore(AbstractDBMetastore):

     def _dataset_dependencies_select_columns(self) -> list["SchemaItem"]:
         return [
-            self._namespaces.c.name,
-            self._projects.c.name,
             self._datasets_dependencies.c.id,
             self._datasets_dependencies.c.dataset_id,
             self._datasets_dependencies.c.dataset_version_id,
@@ -538,26 +429,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)

-    @property
-    def is_studio(self) -> bool:
-        return False
-
-    #
-    # Namespaces
-    #
-
-    @property
-    def default_namespace_name(self):
-        return Namespace.default()
-
-    #
-    # Projects
-    #
-
-    @property
-    def default_project_name(self):
-        return Project.default()
-

 class SQLiteWarehouse(AbstractWarehouse):
     """
@@ -663,16 +534,16 @@ class SQLiteWarehouse(AbstractWarehouse):
     ) -> None:
         dst_empty = False

-        if not self.db.has_table(self.dataset_table_name(src, src_version)):
+        if not self.db.has_table(self.dataset_table_name(src.name, src_version)):
             # source table doesn't exist, nothing to do
             return

         src_dr = self.dataset_rows(src, src_version).table

-        if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
+        if not self.db.has_table(self.dataset_table_name(dst.name, dst_version)):
             # destination table doesn't exist, create it
             self.create_dataset_rows_table(
-                self.dataset_table_name(dst, dst_version),
+                self.dataset_table_name(dst.name, dst_version),
                 columns=src_dr.columns,
             )
             dst_empty = True
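Note: the removed meta-table code implemented a schema-version gate for the local SQLite database: a single-row meta table held a schema_version, and opening a database whose stored version was behind SCHEMA_VERSION raised OutdatedDatabaseSchemaError. A minimal standalone sketch of that pattern using only the standard library (the table and error names mirror the removed code; everything else is illustrative, not datachain's API):

import sqlite3

SCHEMA_VERSION = 1  # bump manually whenever the on-disk schema changes


class OutdatedDatabaseSchemaError(RuntimeError):
    """Raised when the stored schema is older than the code expects."""


def open_db(path: str) -> sqlite3.Connection:
    conn = sqlite3.connect(path)
    tables = {
        row[0]
        for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
    }
    # A non-empty database without a meta table predates schema versioning.
    if tables and "meta" not in tables:
        raise OutdatedDatabaseSchemaError("outdated database schema")
    conn.execute(
        "CREATE TABLE IF NOT EXISTS meta (id INTEGER PRIMARY KEY, schema_version INTEGER)"
    )
    # Write the current version once; later opens keep the stored value.
    conn.execute(
        "INSERT OR IGNORE INTO meta (id, schema_version) VALUES (1, ?)",
        (SCHEMA_VERSION,),
    )
    (stored,) = conn.execute("SELECT schema_version FROM meta WHERE id = 1").fetchone()
    if stored < SCHEMA_VERSION:
        raise OutdatedDatabaseSchemaError("outdated database schema")
    return conn

0.21.0 drops this gate together with the namespace and project tables, so a fresh metastore now creates only the datasets, dataset-version, dependency and jobs tables.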
datachain/data_storage/warehouse.py
CHANGED
@@ -182,7 +182,7 @@ class AbstractWarehouse(ABC, Serializable):
     ):
         version = version or dataset.latest_version

-        table_name = self.dataset_table_name(dataset, version)
+        table_name = self.dataset_table_name(dataset.name, version)
         return self.schema.dataset_row_cls(
             table_name,
             self.db,
@@ -254,24 +254,12 @@ class AbstractWarehouse(ABC, Serializable):
         name = parsed.path if parsed.scheme == "file" else parsed.netloc
         return parsed.scheme, name

-    def dataset_table_name(self, dataset: DatasetRecord, version: str) -> str:
-        return self._construct_dataset_table_name(
-            dataset.project.namespace.name,
-            dataset.project.name,
-            dataset.name,
-            version,
-        )
-
-    def _construct_dataset_table_name(
-        self, namespace: str, project: str, dataset_name: str, version: str
-    ) -> str:
+    def dataset_table_name(self, dataset_name: str, version: str) -> str:
         prefix = self.DATASET_TABLE_PREFIX
         if Client.is_data_source_uri(dataset_name):
             # for datasets that are created for bucket listing we use different prefix
             prefix = self.DATASET_SOURCE_TABLE_PREFIX
-        return (
-            f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"
-        )
+        return f"{prefix}{dataset_name}_{version.replace('.', '_')}"

     def temp_table_name(self) -> str:
         return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
@@ -299,7 +287,7 @@ class AbstractWarehouse(ABC, Serializable):
         if_exists: bool = True,
     ) -> None:
         """Drops a dataset rows table for the given dataset name."""
-        table_name = self.dataset_table_name(dataset, version)
+        table_name = self.dataset_table_name(dataset.name, version)
         table = sa.Table(table_name, self.db.metadata)
         self.db.drop_table(table, if_exists=if_exists)

@@ -356,20 +344,13 @@ class AbstractWarehouse(ABC, Serializable):

     def rename_dataset_table(
         self,
-        dataset: DatasetRecord,
         old_name: str,
         new_name: str,
         old_version: str,
         new_version: str,
     ) -> None:
-        namespace = dataset.project.namespace.name
-        project = dataset.project.name
-        old_ds_table_name = self._construct_dataset_table_name(
-            namespace, project, old_name, old_version
-        )
-        new_ds_table_name = self._construct_dataset_table_name(
-            namespace, project, new_name, new_version
-        )
+        old_ds_table_name = self.dataset_table_name(old_name, old_version)
+        new_ds_table_name = self.dataset_table_name(new_name, new_version)

         self.db.rename_table(old_ds_table_name, new_ds_table_name)

@@ -387,7 +368,7 @@ class AbstractWarehouse(ABC, Serializable):
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
-        if not (self.db.has_table(self.dataset_table_name(dataset, version))):
+        if not (self.db.has_table(self.dataset_table_name(dataset.name, version))):
             return None, None

         file_signals = list(
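Note: the visible effect of collapsing dataset_table_name and _construct_dataset_table_name is that physical table names lose their namespace and project segments, which is also why every call site changed from dataset_table_name(dataset, version) to dataset_table_name(dataset.name, version). A small sketch of both naming schemes (the "ds_" prefix is an assumption standing in for DATASET_TABLE_PREFIX; the sample names are made up):

PREFIX = "ds_"  # assumed stand-in for AbstractWarehouse.DATASET_TABLE_PREFIX


def old_table_name(namespace: str, project: str, name: str, version: str) -> str:
    # 0.20.4: table names were qualified with namespace and project
    return f"{PREFIX}{namespace}_{project}_{name}_{version.replace('.', '_')}"


def new_table_name(name: str, version: str) -> str:
    # 0.21.0: only dataset name and version remain
    return f"{PREFIX}{name}_{version.replace('.', '_')}"


print(old_table_name("local", "local", "dogs", "3.0.1"))  # ds_local_local_dogs_3_0_1
print(new_table_name("dogs", "3.0.1"))                    # ds_dogs_3_0_1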
datachain/dataset.py
CHANGED
@@ -13,9 +13,7 @@ from typing import (
 from urllib.parse import urlparse

 from datachain import semver
-from datachain.error import DatasetVersionNotFoundError
-from datachain.namespace import Namespace
-from datachain.project import Project
+from datachain.error import DatasetVersionNotFoundError
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

 T = TypeVar("T", bound="DatasetRecord")
@@ -29,8 +27,6 @@ QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"

 DEFAULT_DATASET_VERSION = "1.0.0"
-DATASET_NAME_RESERVED_CHARS = ["."]
-DATASET_NAME_REPLACEMENT_CHAR = "_"


 # StorageURI represents a normalised URI to a valid storage location (full bucket or
@@ -61,35 +57,20 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
     return name, s[1]


-def create_dataset_uri(
-    name: str, namespace: str, project: str, version: Optional[str] = None
-) -> str:
+def create_dataset_uri(name: str, version: Optional[str] = None) -> str:
     """
-    Creates a dataset uri based on
-    version.
+    Creates a dataset uri based on dataset name and optionally version
     Example:
-        Input:
-        Output: ds://
+        Input: zalando, 3.0.1
+        Output: ds://zalando@v3.0.1
     """
-    uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
+    uri = f"{DATASET_PREFIX}{name}"
     if version:
         uri += f"@v{version}"

     return uri


-def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
-    """Parses dataset name and returns namespace, project and name"""
-    if not name:
-        raise ValueError("Name must be defined to parse it")
-    split = name.split(".")
-    name = split[-1]
-    project_name = split[-2] if len(split) > 1 else None
-    namespace_name = split[-3] if len(split) > 2 else None
-
-    return namespace_name, project_name, name
-
-
 class DatasetDependencyType:
     DATASET = "dataset"
     STORAGE = "storage"
@@ -97,12 +78,8 @@ class DatasetDependencyType:

 @dataclass
 class DatasetDependency:
-    # TODO put `DatasetRecord` instead of name + version which will
-    # simplify codebase in various places
     id: int
     type: str
-    namespace: str
-    project: str
     name: str
     version: str
     created_at: datetime
@@ -123,8 +100,6 @@ class DatasetDependency:
     @classmethod
     def parse(
         cls: builtins.type[DD],
-        namespace_name: str,
-        project_name: str,
         id: int,
         dataset_id: Optional[int],
         dataset_version_id: Optional[int],
@@ -146,8 +121,6 @@ class DatasetDependency:
                 if is_listing_dataset(dataset_name)
                 else DatasetDependencyType.DATASET
             ),
-            namespace_name,
-            project_name,
             dataset_name,
             (
                 dataset_version  # type: ignore[arg-type]
@@ -362,7 +335,6 @@ class DatasetListVersion:
 class DatasetRecord:
     id: int
     name: str
-    project: Project
     description: Optional[str]
     attrs: list[str]
     schema: dict[str, Union[SQLType, type[SQLType]]]
@@ -377,9 +349,6 @@ class DatasetRecord:
     sources: str = ""
     query_script: str = ""

-    def __hash__(self):
-        return hash(f"{self.id}")
-
     @staticmethod
     def parse_schema(
         ct: dict[str, Any],
@@ -389,31 +358,10 @@ class DatasetRecord:
             for c_name, c_type in ct.items()
         }

-    @staticmethod
-    def validate_name(name: str) -> None:
-        """Throws exception if name has reserved characters"""
-        for c in DATASET_NAME_RESERVED_CHARS:
-            if c in name:
-                raise InvalidDatasetNameError(
-                    f"Character {c} is reserved and not allowed in dataset name"
-                )
-
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-        namespace_id: int,
-        namespace_uuid: str,
-        namespace_name: str,
-        namespace_description: Optional[str],
-        namespace_created_at: datetime,
-        project_id: int,
-        project_uuid: str,
-        project_name: str,
-        project_description: Optional[str],
-        project_created_at: datetime,
-        project_namespace_id: int,
-        dataset_id: int,
-        dataset_project_id: int,
+        id: int,
         name: str,
         description: Optional[str],
         attrs: str,
@@ -452,23 +400,6 @@ class DatasetRecord:
             json.loads(version_schema) if version_schema else {}
         )

-        namespace = Namespace(
-            namespace_id,
-            namespace_uuid,
-            namespace_name,
-            namespace_description,
-            namespace_created_at,
-        )
-
-        project = Project(
-            project_id,
-            project_uuid,
-            project_name,
-            project_description,
-            project_created_at,
-            namespace,
-        )
-
         dataset_version = DatasetVersion.parse(
             version_id,
             version_uuid,
@@ -491,9 +422,8 @@ class DatasetRecord:
         )

         return cls(
-            dataset_id,
+            id,
             name,
-            project,
             description,
             attrs_lst,
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
@@ -518,10 +448,6 @@ class DatasetRecord:
             for c_name, c_type in self.schema.items()
         }

-    @property
-    def full_name(self) -> str:
-        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
-
     def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
         return self.get_version(version).schema if version else self.schema

@@ -601,10 +527,7 @@ class DatasetRecord:
         Dataset uri example: ds://dogs@v3.0.1
         """
         identifier = self.identifier(version)
-        return (
-            f"{DATASET_PREFIX}{self.project.namespace.name}"
-            f".{self.project.name}.{identifier}"
-        )
+        return f"{DATASET_PREFIX}{identifier}"

     @property
     def next_version_major(self) -> str:
@@ -669,17 +592,15 @@ class DatasetRecord:

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
-        project = Project.from_dict(d.pop("project"))
         versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
-        return cls(**kwargs, versions=versions, project=project)
+        return cls(**kwargs, versions=versions)


 @dataclass
 class DatasetListRecord:
     id: int
     name: str
-    project: Project
     description: Optional[str]
     attrs: list[str]
     versions: list[DatasetListVersion]
@@ -688,18 +609,7 @@ class DatasetListRecord:
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-        namespace_id: int,
-        namespace_uuid: str,
-        namespace_name: str,
-        namespace_description: Optional[str],
-        namespace_created_at: datetime,
-        project_id: int,
-        project_uuid: str,
-        project_name: str,
-        project_description: Optional[str],
-        project_created_at: datetime,
-        project_namespace_id: int,
-        dataset_id: int,
+        id: int,
         name: str,
         description: Optional[str],
         attrs: str,
@@ -720,23 +630,6 @@ class DatasetListRecord:
     ) -> "DatasetListRecord":
         attrs_lst: list[str] = json.loads(attrs) if attrs else []

-        namespace = Namespace(
-            namespace_id,
-            namespace_uuid,
-            namespace_name,
-            namespace_description,
-            namespace_created_at,
-        )
-
-        project = Project(
-            project_id,
-            project_uuid,
-            project_name,
-            project_description,
-            project_created_at,
-            namespace,
-        )
-
         dataset_version = DatasetListVersion.parse(
             version_id,
             version_uuid,
@@ -754,19 +647,14 @@ class DatasetListRecord:
         )

         return cls(
-            dataset_id,
+            id,
             name,
-            project,
             description,
             attrs_lst,
             [dataset_version],
             created_at,
         )

-    @property
-    def full_name(self) -> str:
-        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
-
     def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         """Merge versions from another dataset"""
         if other.id != self.id:
@@ -803,11 +691,9 @@ class DatasetListRecord:

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
-        project = Project.from_dict(d.pop("project"))
         versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
         kwargs["versions"] = versions
-        kwargs["project"] = project
         return cls(**kwargs)
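Note: with parse_dataset_name gone and create_dataset_uri reduced to a name plus an optional version, dataset URIs are flat again. A sketch of the new behavior (assuming DATASET_PREFIX = "ds://", consistent with the ds://dogs@v3.0.1 example in the uri docstring):

from typing import Optional

DATASET_PREFIX = "ds://"  # assumed, matching the ds://dogs@v3.0.1 docstring example


def create_dataset_uri(name: str, version: Optional[str] = None) -> str:
    # Mirrors the 0.21.0 body: no namespace or project segments
    uri = f"{DATASET_PREFIX}{name}"
    if version:
        uri += f"@v{version}"
    return uri


assert create_dataset_uri("zalando", "3.0.1") == "ds://zalando@v3.0.1"
assert create_dataset_uri("zalando") == "ds://zalando"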
datachain/delta.py
CHANGED
@@ -56,13 +56,11 @@ def _get_delta_chain(
     compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get delta chain for processing changes between versions."""
-    source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
-    source_dc_latest = datachain.read_dataset(
-        source_ds_name, version=source_ds_latest_version
-    )
+    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)

     # Calculate diff between source versions
-    return source_dc_latest.
+    return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)


 def _get_retry_chain(
@@ -81,10 +79,8 @@ def _get_retry_chain(
     retry_chain = None

     # Read the latest version of the result dataset for retry logic
-    result_dataset = datachain.read_dataset(name, version=latest_version)
-    source_dc_latest = datachain.read_dataset(
-        source_ds_name, version=source_ds_latest_version
-    )
+    result_dataset = datachain.read_dataset(name, latest_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)

     # Handle error records if delta_retry is a string (column name)
     if isinstance(delta_retry, str):
@@ -236,8 +232,8 @@ def delta_retry_update(
     if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False

-    latest_dataset = datachain.read_dataset(name, version=latest_version)
-    compared_chain = latest_dataset.
+    latest_dataset = datachain.read_dataset(name, latest_version)
+    compared_chain = latest_dataset.compare(
         processing_chain,
         on=right_on or on,
         added=True,
datachain/error.py
CHANGED
@@ -2,42 +2,10 @@ class DataChainError(RuntimeError):
     pass


-class InvalidDatasetNameError(RuntimeError):
-    pass
-
-
-class InvalidNamespaceNameError(RuntimeError):
-    pass
-
-
-class InvalidProjectNameError(RuntimeError):
-    pass
-
-
 class NotFoundError(Exception):
     pass


-class NamespaceNotFoundError(NotFoundError):
-    pass
-
-
-class NotAllowedError(Exception):
-    pass
-
-
-class NamespaceCreateNotAllowedError(NotAllowedError):
-    pass
-
-
-class ProjectCreateNotAllowedError(NotAllowedError):
-    pass
-
-
-class ProjectNotFoundError(NotFoundError):
-    pass
-
-
 class DatasetNotFoundError(NotFoundError):
     pass

@@ -85,7 +53,3 @@ class ClientError(RuntimeError):

 class TableMissingError(DataChainError):
     pass
-
-
-class OutdatedDatabaseSchemaError(DataChainError):
-    pass