datachain 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- datachain/checkpoint.py +44 -0
- datachain/client/fsspec.py +6 -1
- datachain/client/http.py +157 -0
- datachain/data_storage/metastore.py +137 -0
- datachain/data_storage/schema.py +1 -1
- datachain/data_storage/sqlite.py +8 -0
- datachain/error.py +4 -0
- datachain/lib/dc/datachain.py +13 -1
- {datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/METADATA +2 -1
- {datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/RECORD +14 -12
- {datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/WHEEL +0 -0
- {datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/top_level.txt +0 -0
datachain/checkpoint.py
ADDED

@@ -0,0 +1,44 @@
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Union
+
+
+@dataclass
+class Checkpoint:
+    """
+    Represents a checkpoint within a job run.
+
+    A checkpoint marks a successfully completed stage of execution. In the event
+    of a failure, the job can resume from the most recent checkpoint rather than
+    starting over from the beginning.
+
+    Checkpoints can also be created in a "partial" mode, which indicates that the
+    work at this stage was only partially completed. For example, if a failure
+    occurs halfway through running a UDF, already computed results can still be
+    saved, allowing the job to resume from that partially completed state on
+    restart.
+    """
+
+    id: str
+    job_id: str
+    hash: str
+    partial: bool
+    created_at: datetime
+
+    @classmethod
+    def parse(
+        cls,
+        id: Union[str, uuid.UUID],
+        job_id: str,
+        _hash: str,
+        partial: bool,
+        created_at: datetime,
+    ) -> "Checkpoint":
+        return cls(
+            str(id),
+            job_id,
+            _hash,
+            bool(partial),
+            created_at,
+        )
datachain/client/fsspec.py
CHANGED

@@ -93,10 +93,11 @@ class Client(ABC):
         self.uri = self.get_uri(self.name)
 
     @staticmethod
-    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
+    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:  # noqa: PLR0911
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
+        from .http import HTTPClient, HTTPSClient
         from .local import FileClient
         from .s3 import ClientS3
 
@@ -114,6 +115,10 @@ class Client(ABC):
             return FileClient
         if protocol == HfClient.protocol:
             return HfClient
+        if protocol == HTTPClient.protocol:
+            return HTTPClient
+        if protocol == HTTPSClient.protocol:
+            return HTTPSClient
 
         raise NotImplementedError(f"Unsupported protocol: {protocol}")
 
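
With these branches in place, plain web URLs resolve to the new clients. A hedged sketch of the dispatch (assuming, as the existing branches suggest, that the scheme is extracted from the URL before the comparisons):

from datachain.client.fsspec import Client
from datachain.client.http import HTTPClient, HTTPSClient

assert Client.get_implementation("http://example.com/data.csv") is HTTPClient
assert Client.get_implementation("https://example.com/data.csv") is HTTPSClient

try:
    Client.get_implementation("ftp://example.com/file")  # still unsupported
except NotImplementedError as exc:
    print(exc)  # Unsupported protocol: ftp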
datachain/client/http.py
ADDED

@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
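
A small sketch of the URL helpers above, using hypothetical URLs (note that `split_url` keeps query and fragment as part of the path, and `get_uri` prepends the scheme only when it is missing):

from datachain.client.http import HTTPSClient

domain, path = HTTPSClient.split_url("https://example.com/files/a.csv?v=2#top")
assert domain == "example.com"
assert path == "files/a.csv?v=2#top"

assert HTTPSClient.get_uri("example.com") == "https://example.com"
assert HTTPSClient.is_root_url("https://example.com/")
assert not HTTPSClient.is_root_url("https://example.com/files")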
datachain/data_storage/metastore.py
CHANGED

@@ -13,6 +13,7 @@ from uuid import uuid4
 from sqlalchemy import (
     JSON,
     BigInteger,
+    Boolean,
     Column,
     DateTime,
     ForeignKey,
@@ -24,6 +25,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.sql import func as f
 
+from datachain.checkpoint import Checkpoint
 from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import (
@@ -36,6 +38,7 @@ from datachain.dataset import (
     StorageURI,
 )
 from datachain.error import (
+    CheckpointNotFoundError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
     NamespaceDeleteNotAllowedError,
@@ -75,6 +78,7 @@ class AbstractMetastore(ABC, Serializable):
     dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
+    checkpoint_class: type[Checkpoint] = Checkpoint
 
     def __init__(
         self,
@@ -431,6 +435,35 @@ class AbstractMetastore(ABC, Serializable):
     def get_job_status(self, job_id: str) -> Optional[JobStatus]:
         """Returns the status of the given job."""
 
+    #
+    # Checkpoints
+    #
+
+    @abstractmethod
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+        """Returns all checkpoints related to some job"""
+
+    @abstractmethod
+    def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
+        """Gets single checkpoint by id"""
+
+    def find_checkpoint(
+        self, job_id: str, _hash: str, partial: bool = False, conn=None
+    ) -> Optional[Checkpoint]:
+        """
+        Tries to find a checkpoint for a job with a specific hash and partial flag
+        """
+
+    @abstractmethod
+    def create_checkpoint(
+        self,
+        job_id: str,
+        _hash: str,
+        partial: bool = False,
+        conn: Optional[Any] = None,
+    ) -> Checkpoint:
+        """Creates new checkpoint"""
+
 
 class AbstractDBMetastore(AbstractMetastore):
     """
@@ -446,6 +479,7 @@ class AbstractDBMetastore(AbstractMetastore):
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
     JOBS_TABLE = "jobs"
+    CHECKPOINTS_TABLE = "checkpoints"
 
     db: "DatabaseEngine"
 
@@ -1663,3 +1697,106 @@ class AbstractDBMetastore(AbstractMetastore):
         if not results:
             return None
         return results[0][0]
+
+    #
+    # Checkpoints
+    #
+
+    @staticmethod
+    def _checkpoints_columns() -> "list[SchemaItem]":
+        return [
+            Column(
+                "id",
+                Text,
+                default=uuid4,
+                primary_key=True,
+                nullable=False,
+            ),
+            Column("job_id", Text, nullable=True),
+            Column("hash", Text, nullable=False),
+            Column("partial", Boolean, default=False),
+            Column("created_at", DateTime(timezone=True), nullable=False),
+            UniqueConstraint("job_id", "hash"),
+        ]
+
+    @cached_property
+    def _checkpoints_fields(self) -> list[str]:
+        return [c.name for c in self._checkpoints_columns() if c.name]  # type: ignore[attr-defined]
+
+    @cached_property
+    def _checkpoints(self) -> "Table":
+        return Table(
+            self.CHECKPOINTS_TABLE,
+            self.db.metadata,
+            *self._checkpoints_columns(),
+        )
+
+    @abstractmethod
+    def _checkpoints_insert(self) -> "Insert": ...
+
+    def _checkpoints_select(self, *columns) -> "Select":
+        if not columns:
+            return self._checkpoints.select()
+        return select(*columns)
+
+    def _checkpoints_delete(self) -> "Delete":
+        return self._checkpoints.delete()
+
+    def _checkpoints_query(self):
+        return self._checkpoints_select(
+            *[getattr(self._checkpoints.c, f) for f in self._checkpoints_fields]
+        )
+
+    def create_checkpoint(
+        self,
+        job_id: str,
+        _hash: str,
+        partial: bool = False,
+        conn: Optional[Any] = None,
+    ) -> Checkpoint:
+        """
+        Creates a new checkpoint.
+        """
+        checkpoint_id = str(uuid4())
+        self.db.execute(
+            self._checkpoints_insert().values(
+                id=checkpoint_id,
+                job_id=job_id,
+                hash=_hash,
+                partial=partial,
+                created_at=datetime.now(timezone.utc),
+            ),
+            conn=conn,
+        )
+        return self.get_checkpoint_by_id(checkpoint_id)
+
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+        """List checkpoints by job id."""
+        query = self._checkpoints_query().where(self._checkpoints.c.job_id == job_id)
+        rows = list(self.db.execute(query, conn=conn))
+
+        yield from [self.checkpoint_class.parse(*r) for r in rows]
+
+    def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
+        """Returns the checkpoint with the given ID."""
+        ch = self._checkpoints
+        query = self._checkpoints_select(ch).where(ch.c.id == checkpoint_id)
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            raise CheckpointNotFoundError(f"Checkpoint {checkpoint_id} not found")
+        return self.checkpoint_class.parse(*rows[0])
+
+    def find_checkpoint(
+        self, job_id: str, _hash: str, partial: bool = False, conn=None
+    ) -> Optional[Checkpoint]:
+        """
+        Tries to find a checkpoint for a job with a specific hash and partial flag
+        """
+        ch = self._checkpoints
+        query = self._checkpoints_select(ch).where(
+            ch.c.job_id == job_id, ch.c.hash == _hash, ch.c.partial == partial
+        )
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            return None
+        return self.checkpoint_class.parse(*rows[0])
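
Taken together, the additions form a small create/look-up API. A hedged usage sketch, assuming `metastore` is a concrete `AbstractDBMetastore` (e.g. the SQLite one below) and `job_id` refers to an existing job:

# `metastore` and `job_id` are assumed to exist in the surrounding setup.
cp = metastore.create_checkpoint(job_id, _hash="stage-1-hash")

# On restart, look the hash up to decide whether the stage can be skipped.
if metastore.find_checkpoint(job_id, "stage-1-hash") is not None:
    print("stage already completed; resuming past it")

for ckpt in metastore.list_checkpoints(job_id):
    print(ckpt.id, ckpt.hash, ckpt.partial, ckpt.created_at)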
datachain/data_storage/schema.py
CHANGED

@@ -51,7 +51,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
     """
     c_set: dict[str, sa.Column] = {}
     for c in columns:
-        if (ec := c_set.get(c.name
+        if (ec := c_set.get(c.name)) is not None:
             if str(ec.type) != str(c.type):
                 raise ValueError(
                     f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
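
For context, a sketch of the behavior `dedup_columns` implements, under the assumption (consistent with the signature above) that deduplicated columns come back in first-seen order:

import sqlalchemy as sa

from datachain.data_storage.schema import dedup_columns

cols = dedup_columns(
    [
        sa.Column("id", sa.Integer),
        sa.Column("id", sa.Integer),  # same name and type: collapsed
        sa.Column("name", sa.Text),
    ]
)
assert [c.name for c in cols] == ["id", "name"]

# A second "id" with a different type would instead raise:
# ValueError: conflicting types for column id:TEXT and INTEGER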
datachain/data_storage/sqlite.py
CHANGED

@@ -459,6 +459,8 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.default_table_names.append(self._datasets_dependencies.name)
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)
+        self.db.create_table(self._checkpoints, if_not_exists=True)
+        self.default_table_names.append(self._checkpoints.name)
 
     def _init_namespaces_projects(self) -> None:
         """
@@ -543,6 +545,12 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)
 
+    #
+    # Checkpoints
+    #
+    def _checkpoints_insert(self) -> "Insert":
+        return sqlite.insert(self._checkpoints)
+
     #
     # Namespaces
     #
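
The per-backend surface is deliberately thin: only the dialect-specific insert is overridden, while the table definition and queries are inherited from `AbstractDBMetastore`. A hypothetical sketch of what another backend would add (the class and dialect choice are illustrative, not part of this release):

from sqlalchemy.dialects import postgresql

from datachain.data_storage.metastore import AbstractDBMetastore


class PostgresMetastoreSketch(AbstractDBMetastore):
    """Illustrative only: the single checkpoint hook a backend must provide."""

    def _checkpoints_insert(self):
        return postgresql.insert(self._checkpoints)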
datachain/error.py
CHANGED
(diff body not rendered in this view; given the new CheckpointNotFoundError import in metastore.py, the four added lines presumably define that exception)
datachain/lib/dc/datachain.py
CHANGED

@@ -1,4 +1,5 @@
 import copy
+import hashlib
 import os
 import os.path
 import sys
@@ -18,6 +19,7 @@ from typing import (
     cast,
     overload,
 )
+from uuid import uuid4
 
 import sqlalchemy
 import ujson as json
@@ -665,7 +667,7 @@ class DataChain:
             name, namespace=namespace_name, project=project_name, **kwargs
         )
 
-        return self._evolve(
+        result = self._evolve(
             query=self._query.save(
                 name=name,
                 version=version,
@@ -678,6 +680,16 @@ class DataChain:
             )
         )
 
+        if job_id := os.getenv("DATACHAIN_JOB_ID"):
+            catalog.metastore.create_checkpoint(
+                job_id,  # type: ignore[arg-type]
+                _hash=hashlib.sha256(  # TODO this will be replaced with self.hash()
+                    str(uuid4()).encode()
+                ).hexdigest(),
+            )
+
+        return result
+
     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
 
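
In effect, every save executed while `DATACHAIN_JOB_ID` is set records a checkpoint for that job (with a random placeholder hash for now, per the TODO). A hedged sketch of the trigger, assuming a local setup where the job id would normally be set by the job runner:

import os

import datachain as dc

# Normally set by the job runner; set manually here for illustration.
os.environ["DATACHAIN_JOB_ID"] = "my-job-id"

# Saving a dataset now also calls metastore.create_checkpoint(job_id, ...).
dc.read_values(num=[1, 2, 3]).save("numbers")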
{datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.32.3
+Version: 0.33.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -86,6 +86,7 @@ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
 Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
{datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/RECORD
CHANGED

@@ -2,10 +2,11 @@ datachain/__init__.py,sha256=BRqfLPoBRRycnndaxyba-i4ZrZCJl0As2pwV9RiNBr8,1822
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
+datachain/checkpoint.py,sha256=Ar6SnnDMN3fr5ZZm3Xpdbj2f9buhqeApad-B1Lyrr4Y,1152
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
-datachain/error.py,sha256=
+datachain/error.py,sha256=WR1MoO9BPI0hO1FVKVTS0hgyxxumywtDnSY7Sv1oE1c,1796
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
@@ -41,18 +42,19 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=urt-b9Osay-S4LmwyXUKyYp-JHUBlFewoUvYNP7W_Jw,14553
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
 datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
+datachain/client/http.py,sha256=oU4nxaOa3xNXkxprDjjIS5fufgRJS0eNHTau3FUC6sg,5171
 datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
 datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
-datachain/data_storage/metastore.py,sha256=
-datachain/data_storage/schema.py,sha256=
+datachain/data_storage/metastore.py,sha256=TgLYAKraH1WsmteaAqO5TW2VzNZZM4_SASgcBlDzdr8,60218
+datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
+datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
 datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -104,7 +106,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=
+datachain/lib/dc/datachain.py,sha256=1LvKFKqAWw8TMw2bdpfG6LfOCMMgBS6bluBp0lCX0s4,100845
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -161,9 +163,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.33.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.33.0.dist-info/METADATA,sha256=UGH-boSaU6Kaz6RIsQItwQe4Auzl6L4oHSeeNCKZ7pw,13655
+datachain-0.33.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.33.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.33.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.33.0.dist-info/RECORD,,
{datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/WHEEL
File without changes

{datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/entry_points.txt
File without changes

{datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/licenses/LICENSE
File without changes

{datachain-0.32.3.dist-info → datachain-0.33.0.dist-info}/top_level.txt
File without changes