datachain 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/catalog/catalog.py +58 -22
- datachain/checkpoint.py +44 -0
- datachain/client/fsspec.py +6 -1
- datachain/client/http.py +157 -0
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +137 -0
- datachain/data_storage/schema.py +1 -1
- datachain/data_storage/sqlite.py +8 -0
- datachain/diff/__init__.py +7 -13
- datachain/error.py +4 -0
- datachain/hash_utils.py +147 -0
- datachain/lib/dc/datachain.py +21 -1
- datachain/lib/signal_schema.py +7 -0
- datachain/lib/udf.py +20 -0
- datachain/query/dataset.py +107 -0
- datachain/utils.py +6 -0
- {datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/METADATA +3 -2
- {datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/RECORD +22 -19
- {datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/WHEEL +0 -0
- {datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -144,19 +144,26 @@ def shutdown_process(
     return proc.wait()
 
 
-def
+def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
     buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt
 
-
-
-
-            buffer = b""  # Clear buffer for next line
+    try:
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt
 
-
-
-
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8", errors="replace")
+                callback(line)
+                buffer = b""  # Clear buffer for the next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8", errors="replace")
+            callback(line)
+    finally:
+        try:
+            stream.close()  # Ensure output is closed
+        except Exception:  # noqa: BLE001, S110
+            pass
 
 
 class DatasetRowsFetcher(NodesThreadPool):

@@ -1760,13 +1767,13 @@ class Catalog:
             recursive=recursive,
         )
 
+    @staticmethod
     def query(
-        self,
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-
-
+        stdout_callback: Optional[Callable[[str], None]] = None,
+        stderr_callback: Optional[Callable[[str], None]] = None,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
         interrupt_timeout: Optional[int] = None,

@@ -1781,13 +1788,18 @@ class Catalog:
             },
         )
         popen_kwargs: dict[str, Any] = {}
-
-
+
+        if stdout_callback is not None:
+            popen_kwargs = {"stdout": subprocess.PIPE}
+        if stderr_callback is not None:
+            popen_kwargs["stderr"] = subprocess.PIPE
 
         def raise_termination_signal(sig: int, _: Any) -> NoReturn:
             raise TerminationSignal(sig)
 
-
+        stdout_thread: Optional[Thread] = None
+        stderr_thread: Optional[Thread] = None
+
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
             logger.info("Starting process %s", proc.pid)
 
@@ -1801,10 +1813,20 @@
         orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
         signal.signal(signal.SIGTERM, raise_termination_signal)
         try:
-            if
-
-
-
+            if stdout_callback is not None:
+                stdout_thread = Thread(
+                    target=process_output,
+                    args=(proc.stdout, stdout_callback),
+                    daemon=True,
+                )
+                stdout_thread.start()
+            if stderr_callback is not None:
+                stderr_thread = Thread(
+                    target=process_output,
+                    args=(proc.stderr, stderr_callback),
+                    daemon=True,
+                )
+                stderr_thread.start()
 
             proc.wait()
         except TerminationSignal as exc:

@@ -1822,8 +1844,22 @@
         finally:
             signal.signal(signal.SIGTERM, orig_sigterm_handler)
             signal.signal(signal.SIGINT, orig_sigint_handler)
-
-
+            # wait for the reader thread
+            thread_join_timeout_seconds = 30
+            if stdout_thread is not None:
+                stdout_thread.join(timeout=thread_join_timeout_seconds)
+                if stdout_thread.is_alive():
+                    logger.warning(
+                        "stdout thread is still alive after %s seconds",
+                        thread_join_timeout_seconds,
+                    )
+            if stderr_thread is not None:
+                stderr_thread.join(timeout=thread_join_timeout_seconds)
+                if stderr_thread.is_alive():
+                    logger.warning(
+                        "stderr thread is still alive after %s seconds",
+                        thread_join_timeout_seconds,
+                    )
 
         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode in (
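For reference, a small sketch of how the new module-level `process_output` helper behaves when fed a byte stream. This is not part of the diff; it assumes datachain 0.33.1 is installed, and the sample bytes are made up:

    import io

    from datachain.catalog.catalog import process_output

    captured: list[str] = []
    # Lines are delivered as they are terminated by "\n" or "\r"; the stream is
    # closed in the helper's finally block.
    process_output(io.BytesIO(b"first line\nsecond line\r"), captured.append)
    print(captured)  # ['first line\n', 'second line\r']

Catalog.query forwards the subprocess's stdout/stderr to such callbacks on daemon threads, so callers can stream query-script output without blocking the main wait loop.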
datachain/checkpoint.py
ADDED
@@ -0,0 +1,44 @@
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Union
+
+
+@dataclass
+class Checkpoint:
+    """
+    Represents a checkpoint within a job run.
+
+    A checkpoint marks a successfully completed stage of execution. In the event
+    of a failure, the job can resume from the most recent checkpoint rather than
+    starting over from the beginning.
+
+    Checkpoints can also be created in a "partial" mode, which indicates that the
+    work at this stage was only partially completed. For example, if a failure
+    occurs halfway through running a UDF, already computed results can still be
+    saved, allowing the job to resume from that partially completed state on
+    restart.
+    """
+
+    id: str
+    job_id: str
+    hash: str
+    partial: bool
+    created_at: datetime
+
+    @classmethod
+    def parse(
+        cls,
+        id: Union[str, uuid.UUID],
+        job_id: str,
+        _hash: str,
+        partial: bool,
+        created_at: datetime,
+    ) -> "Checkpoint":
+        return cls(
+            str(id),
+            job_id,
+            _hash,
+            bool(partial),
+            created_at,
+        )
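A minimal sketch of the `Checkpoint.parse` constructor above; the job id and hash values are made-up placeholders:

    import uuid
    from datetime import datetime, timezone

    from datachain.checkpoint import Checkpoint

    cp = Checkpoint.parse(
        uuid.uuid4(),              # id may arrive as a UUID object from the DB driver
        "job-123",
        "9f86d081884c7d65",        # hash of the completed stage
        0,                         # partial may arrive as 0/1 from SQLite
        datetime.now(timezone.utc),
    )
    print(cp.id, cp.partial)       # stringified UUID, False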
datachain/client/fsspec.py
CHANGED
@@ -93,10 +93,11 @@ class Client(ABC):
         self.uri = self.get_uri(self.name)
 
     @staticmethod
-    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
+    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:  # noqa: PLR0911
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
+        from .http import HTTPClient, HTTPSClient
         from .local import FileClient
         from .s3 import ClientS3
 
@@ -114,6 +115,10 @@ class Client(ABC):
             return FileClient
         if protocol == HfClient.protocol:
             return HfClient
+        if protocol == HTTPClient.protocol:
+            return HTTPClient
+        if protocol == HTTPSClient.protocol:
+            return HTTPSClient
 
         raise NotImplementedError(f"Unsupported protocol: {protocol}")
 
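With the new branches above, protocol dispatch now resolves plain web URLs. A small sketch (the example URL is a placeholder):

    from datachain.client.fsspec import Client
    from datachain.client.http import HTTPSClient

    # An "https" scheme now maps to the new read-only HTTPS client.
    impl = Client.get_implementation("https://example.com/data/file.csv")
    assert impl is HTTPSClient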
datachain/client/http.py
ADDED
@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
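A quick sketch of the URL handling defined in the new client above (the example URL is a placeholder):

    from datachain.client.http import HTTPSClient

    domain, path = HTTPSClient.split_url("https://example.com/datasets/cats.csv?v=2")
    print(domain)  # example.com
    print(path)    # datasets/cats.csv?v=2  (query and fragment are kept in the path part)

    print(HTTPSClient.get_uri("example.com/datasets"))      # https://example.com/datasets
    print(HTTPSClient.is_root_url("https://example.com/"))  # True

Note that the client is deliberately read-only: `upload` raises NotImplementedError, and HTTP URLs cannot be signed or expired.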
datachain/data_storage/job.py
CHANGED
datachain/data_storage/metastore.py
CHANGED

@@ -13,6 +13,7 @@ from uuid import uuid4
 from sqlalchemy import (
     JSON,
     BigInteger,
+    Boolean,
     Column,
     DateTime,
     ForeignKey,

@@ -24,6 +25,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.sql import func as f
 
+from datachain.checkpoint import Checkpoint
 from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import (

@@ -36,6 +38,7 @@ from datachain.dataset import (
     StorageURI,
 )
 from datachain.error import (
+    CheckpointNotFoundError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
     NamespaceDeleteNotAllowedError,

@@ -75,6 +78,7 @@ class AbstractMetastore(ABC, Serializable):
     dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
+    checkpoint_class: type[Checkpoint] = Checkpoint
 
     def __init__(
         self,

@@ -431,6 +435,35 @@ class AbstractMetastore(ABC, Serializable):
     def get_job_status(self, job_id: str) -> Optional[JobStatus]:
         """Returns the status of the given job."""
 
+    #
+    # Checkpoints
+    #
+
+    @abstractmethod
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+        """Returns all checkpoints related to some job"""
+
+    @abstractmethod
+    def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
+        """Gets single checkpoint by id"""
+
+    def find_checkpoint(
+        self, job_id: str, _hash: str, partial: bool = False, conn=None
+    ) -> Optional[Checkpoint]:
+        """
+        Tries to find checkpoint for a job with specific hash and optionally partial
+        """
+
+    @abstractmethod
+    def create_checkpoint(
+        self,
+        job_id: str,
+        _hash: str,
+        partial: bool = False,
+        conn: Optional[Any] = None,
+    ) -> Checkpoint:
+        """Creates new checkpoint"""
+
 
 class AbstractDBMetastore(AbstractMetastore):
     """

@@ -446,6 +479,7 @@ class AbstractDBMetastore(AbstractMetastore):
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
     JOBS_TABLE = "jobs"
+    CHECKPOINTS_TABLE = "checkpoints"
 
     db: "DatabaseEngine"
 
@@ -1663,3 +1697,106 @@ class AbstractDBMetastore(AbstractMetastore):
         if not results:
             return None
         return results[0][0]
+
+    #
+    # Checkpoints
+    #
+
+    @staticmethod
+    def _checkpoints_columns() -> "list[SchemaItem]":
+        return [
+            Column(
+                "id",
+                Text,
+                default=uuid4,
+                primary_key=True,
+                nullable=False,
+            ),
+            Column("job_id", Text, nullable=True),
+            Column("hash", Text, nullable=False),
+            Column("partial", Boolean, default=False),
+            Column("created_at", DateTime(timezone=True), nullable=False),
+            UniqueConstraint("job_id", "hash"),
+        ]
+
+    @cached_property
+    def _checkpoints_fields(self) -> list[str]:
+        return [c.name for c in self._checkpoints_columns() if c.name]  # type: ignore[attr-defined]
+
+    @cached_property
+    def _checkpoints(self) -> "Table":
+        return Table(
+            self.CHECKPOINTS_TABLE,
+            self.db.metadata,
+            *self._checkpoints_columns(),
+        )
+
+    @abstractmethod
+    def _checkpoints_insert(self) -> "Insert": ...
+
+    def _checkpoints_select(self, *columns) -> "Select":
+        if not columns:
+            return self._checkpoints.select()
+        return select(*columns)
+
+    def _checkpoints_delete(self) -> "Delete":
+        return self._checkpoints.delete()
+
+    def _checkpoints_query(self):
+        return self._checkpoints_select(
+            *[getattr(self._checkpoints.c, f) for f in self._checkpoints_fields]
+        )
+
+    def create_checkpoint(
+        self,
+        job_id: str,
+        _hash: str,
+        partial: bool = False,
+        conn: Optional[Any] = None,
+    ) -> Checkpoint:
+        """
+        Creates a new job query step.
+        """
+        checkpoint_id = str(uuid4())
+        self.db.execute(
+            self._checkpoints_insert().values(
+                id=checkpoint_id,
+                job_id=job_id,
+                hash=_hash,
+                partial=partial,
+                created_at=datetime.now(timezone.utc),
+            ),
+            conn=conn,
+        )
+        return self.get_checkpoint_by_id(checkpoint_id)
+
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+        """List checkpoints by job id."""
+        query = self._checkpoints_query().where(self._checkpoints.c.job_id == job_id)
+        rows = list(self.db.execute(query, conn=conn))
+
+        yield from [self.checkpoint_class.parse(*r) for r in rows]
+
+    def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
+        """Returns the checkpoint with the given ID."""
+        ch = self._checkpoints
+        query = self._checkpoints_select(ch).where(ch.c.id == checkpoint_id)
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            raise CheckpointNotFoundError(f"Checkpoint {checkpoint_id} not found")
+        return self.checkpoint_class.parse(*rows[0])
+
+    def find_checkpoint(
+        self, job_id: str, _hash: str, partial: bool = False, conn=None
+    ) -> Optional[Checkpoint]:
+        """
+        Tries to find checkpoint for a job with specific hash and optionally partial
+        """
+        ch = self._checkpoints
+        query = self._checkpoints_select(ch).where(
+            ch.c.job_id == job_id, ch.c.hash == _hash, ch.c.partial == partial
+        )
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            return None
+        return self.checkpoint_class.parse(*rows[0])
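A sketch of the intended checkpoint round trip against any AbstractDBMetastore implementation; the `metastore` and `job_id` objects here are placeholders, only the method signatures come from the diff above:

    # One full and one partial checkpoint for the same job; (job_id, hash) is unique.
    cp = metastore.create_checkpoint(job_id, _hash="deadbeef" * 8)
    partial_cp = metastore.create_checkpoint(job_id, "cafebabe" * 8, partial=True)

    assert metastore.find_checkpoint(job_id, "deadbeef" * 8) == cp
    assert metastore.find_checkpoint(job_id, "cafebabe" * 8, partial=True) == partial_cp
    assert len(list(metastore.list_checkpoints(job_id))) == 2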
datachain/data_storage/schema.py
CHANGED
@@ -51,7 +51,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
     """
     c_set: dict[str, sa.Column] = {}
     for c in columns:
-        if (ec := c_set.get(c.name
+        if (ec := c_set.get(c.name)) is not None:
             if str(ec.type) != str(c.type):
                 raise ValueError(
                     f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
datachain/data_storage/sqlite.py
CHANGED
@@ -459,6 +459,8 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.default_table_names.append(self._datasets_dependencies.name)
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)
+        self.db.create_table(self._checkpoints, if_not_exists=True)
+        self.default_table_names.append(self._checkpoints.name)
 
     def _init_namespaces_projects(self) -> None:
         """

@@ -543,6 +545,12 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)
 
+    #
+    # Checkpoints
+    #
+    def _checkpoints_insert(self) -> "Insert":
+        return sqlite.insert(self._checkpoints)
+
     #
     # Namespaces
     #
datachain/diff/__init__.py
CHANGED
@@ -1,5 +1,3 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union

@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
 
-
 C = Column
 
 
-
-
-
-    random.choice(string.ascii_letters)  # noqa: S311
-    for _ in range(10)
-)
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"
 
 
 class CompareStatus(str, Enum):

@@ -101,9 +95,9 @@ def _compare(  # noqa: C901, PLR0912
     compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]
 
     # get diff column names
-    diff_col = status_col or
-    ldiff_col =
-    rdiff_col =
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME
 
     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})

@@ -227,7 +221,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col =
+    status_col = STATUS_COL_NAME
 
     res = _compare(
         left,
datachain/error.py
CHANGED
datachain/hash_utils.py
ADDED
@@ -0,0 +1,147 @@
+import hashlib
+import inspect
+import json
+import textwrap
+from collections.abc import Sequence
+from typing import TypeVar, Union
+
+from sqlalchemy.sql.elements import (
+    BinaryExpression,
+    BindParameter,
+    ColumnElement,
+    Label,
+    Over,
+    UnaryExpression,
+)
+from sqlalchemy.sql.functions import Function
+
+T = TypeVar("T", bound=ColumnElement)
+ColumnLike = Union[str, T]
+
+
+def serialize_column_element(expr: Union[str, ColumnElement]) -> dict:  # noqa: PLR0911
+    """
+    Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.
+    """
+
+    # Binary operations: col > 5, col1 + col2, etc.
+    if isinstance(expr, BinaryExpression):
+        op = (
+            expr.operator.__name__
+            if hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+        return {
+            "type": "binary",
+            "op": op,
+            "left": serialize_column_element(expr.left),
+            "right": serialize_column_element(expr.right),
+        }
+
+    # Unary operations: -col, NOT col, etc.
+    if isinstance(expr, UnaryExpression):
+        op = (
+            expr.operator.__name__
+            if expr.operator is not None and hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+
+        return {
+            "type": "unary",
+            "op": op,
+            "element": serialize_column_element(expr.element),  # type: ignore[arg-type]
+        }
+
+    # Function calls: func.lower(col), func.count(col), etc.
+    if isinstance(expr, Function):
+        return {
+            "type": "function",
+            "name": expr.name,
+            "clauses": [serialize_column_element(c) for c in expr.clauses],
+        }
+
+    # Window functions: func.row_number().over(partition_by=..., order_by=...)
+    if isinstance(expr, Over):
+        return {
+            "type": "window",
+            "function": serialize_column_element(expr.element),
+            "partition_by": [
+                serialize_column_element(p) for p in getattr(expr, "partition_by", [])
+            ],
+            "order_by": [
+                serialize_column_element(o) for o in getattr(expr, "order_by", [])
+            ],
+        }
+
+    # Labeled expressions: col.label("alias")
+    if isinstance(expr, Label):
+        return {
+            "type": "label",
+            "name": expr.name,
+            "element": serialize_column_element(expr.element),
+        }
+
+    # Bound values (constants)
+    if isinstance(expr, BindParameter):
+        return {"type": "bind", "value": expr.value}
+
+    # Plain columns
+    if hasattr(expr, "name"):
+        return {"type": "column", "name": expr.name}
+
+    # Fallback: stringify unknown nodes
+    return {"type": "other", "repr": str(expr)}
+
+
+def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
+    """
+    Hash a list of ColumnElements deterministically, dialect agnostic.
+    Only accepts ordered iterables (like list or tuple).
+    """
+    serialized = [serialize_column_element(c) for c in columns]
+    json_str = json.dumps(serialized, sort_keys=True)  # stable JSON
+    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
+
+def hash_callable(func):
+    """
+    Calculate a hash from a callable.
+    Rules:
+    - Named functions (def) → use source code for stable, cross-version hashing
+    - Lambdas → use bytecode (deterministic in same Python runtime)
+    """
+    if not callable(func):
+        raise TypeError("Expected a callable")
+
+    # Determine if it is a lambda
+    is_lambda = func.__name__ == "<lambda>"
+
+    if not is_lambda:
+        # Try to get exact source of named function
+        try:
+            lines, _ = inspect.getsourcelines(func)
+            payload = textwrap.dedent("".join(lines)).strip()
+        except (OSError, TypeError):
+            # Fallback: bytecode if source not available
+            payload = func.__code__.co_code
+    else:
+        # For lambdas, fall back directly to bytecode
+        payload = func.__code__.co_code
+
+    # Normalize annotations
+    annotations = {
+        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
+    }
+
+    # Extras to distinguish functions with same code but different metadata
+    extras = {
+        "name": func.__name__,
+        "defaults": func.__defaults__,
+        "annotations": annotations,
+    }
+
+    # Compute SHA256
+    h = hashlib.sha256()
+    h.update(str(payload).encode() if isinstance(payload, str) else payload)
+    h.update(str(extras).encode())
+    return h.hexdigest()
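A minimal sketch of the two new helpers above (the column names and sample function are made up):

    import sqlalchemy as sa

    from datachain.hash_utils import hash_callable, hash_column_elements

    def add_one(x: int) -> int:
        return x + 1

    # Column expressions hash via their serialized structure, so the digest is
    # the same regardless of SQL dialect.
    exprs = [sa.column("score") > 0.5, sa.func.lower(sa.column("name"))]
    print(hash_column_elements(exprs))

    print(hash_callable(add_one))          # named functions hash via their source
    print(hash_callable(lambda x: x + 1))  # lambdas fall back to bytecode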
datachain/lib/dc/datachain.py
CHANGED
@@ -1,4 +1,5 @@
 import copy
+import hashlib
 import os
 import os.path
 import sys

@@ -18,6 +19,7 @@ from typing import (
     cast,
     overload,
 )
+from uuid import uuid4
 
 import sqlalchemy
 import ujson as json

@@ -207,6 +209,14 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()
 
+    def hash(self) -> str:
+        """
+        Calculates SHA hash of this chain. Hash calculation is fast and consistent.
+        It takes into account all the steps added to the chain and their inputs.
+        Order of the steps is important.
+        """
+        return self._query.hash()
+
     def _as_delta(
         self,
         on: Optional[Union[str, Sequence[str]]] = None,

@@ -665,7 +675,7 @@ class DataChain:
             name, namespace=namespace_name, project=project_name, **kwargs
         )
 
-
+        result = self._evolve(
             query=self._query.save(
                 name=name,
                 version=version,

@@ -678,6 +688,16 @@ class DataChain:
             )
         )
 
+        if job_id := os.getenv("DATACHAIN_JOB_ID"):
+            catalog.metastore.create_checkpoint(
+                job_id,
+                _hash=hashlib.sha256(  # TODO this will be replaced with self.hash()
+                    str(uuid4()).encode()
+                ).hexdigest(),
+            )
+
+        return result
+
     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
 
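A minimal sketch of the new `DataChain.hash()` entry point. The `read_dataset` call and the dataset name are placeholders from the public API, not part of this diff:

    import datachain as dc

    chain = dc.read_dataset("my-dataset").limit(100)   # "my-dataset" is a placeholder
    print(chain.hash())  # 64-char SHA-256 digest; changes if any step or input changes

Note that `save()` currently records a checkpoint keyed by a random hash (see the TODO above); the plan stated in the comment is to replace it with `self.hash()` so that identical chains map to the same checkpoint.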
datachain/lib/signal_schema.py
CHANGED
@@ -1,4 +1,6 @@
 import copy
+import hashlib
+import json
 import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass

@@ -257,6 +259,11 @@ class SignalSchema:
         signals["_custom_types"] = custom_types
         return signals
 
+    def hash(self) -> str:
+        """Create SHA hash of this schema"""
+        json_str = json.dumps(self.serialize(), sort_keys=True, separators=(",", ":"))
+        return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
     @staticmethod
     def _split_subtypes(type_name: str) -> list[str]:
         """This splits a list of subtypes, including proper square bracket handling."""
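A small sketch of the schema hash above; it assumes SignalSchema can be built from a plain name-to-type mapping, which is not shown in this diff:

    from datachain.lib.signal_schema import SignalSchema

    schema = SignalSchema({"name": str, "score": float})
    # serialize() is dumped with sorted keys, so equal schemas hash identically.
    assert schema.hash() == SignalSchema({"name": str, "score": float}).hash()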
datachain/lib/udf.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import sys
 import traceback
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence

@@ -12,6 +13,7 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError

@@ -61,6 +63,9 @@ class UDFAdapter:
     batch_size: Optional[int] = None
     batch: int = 1
 
+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()

@@ -151,6 +156,21 @@ class UDFBase(AbstractUDF):
         self.output = None
         self._func = None
 
+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+        """
+        parts = [
+            hash_callable(self._func),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
datachain/query/dataset.py
CHANGED
@@ -1,4 +1,5 @@
 import contextlib
+import hashlib
 import inspect
 import logging
 import os

@@ -44,6 +45,7 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
+from datachain.hash_utils import hash_column_elements
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFAdapter, _get_cache

@@ -57,6 +59,7 @@ from datachain.sql.types import SQLType
 from datachain.utils import (
     determine_processes,
     determine_workers,
+    ensure_sequence,
     filtered_cloudpickle_dumps,
     get_datachain_executable,
     safe_closing,

@@ -167,6 +170,18 @@ class Step(ABC):
     ) -> "StepResult":
         """Apply the processing step."""
 
+    @abstractmethod
+    def hash_inputs(self) -> str:
+        """Calculates hash of step inputs"""
+
+    def hash(self) -> str:
+        """
+        Calculates hash for step which includes step name and hash of it's inputs
+        """
+        return hashlib.sha256(
+            f"{self.__class__.__name__}|{self.hash_inputs()}".encode()
+        ).hexdigest()
+
 
 @frozen
 class QueryStep:

@@ -186,6 +201,11 @@ class QueryStep:
             q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
         )
 
+    def hash(self) -> str:
+        return hashlib.sha256(
+            self.dataset.uri(self.dataset_version).encode()
+        ).hexdigest()
+
 
 def generator_then_call(generator, func: Callable):
     """

@@ -256,6 +276,13 @@ class DatasetDiffOperation(Step):
 class Subtract(DatasetDiffOperation):
     on: Sequence[tuple[str, str]]
 
+    def hash_inputs(self) -> str:
+        on_bytes = b"".join(
+            f"{a}:{b}".encode() for a, b in sorted(self.on, key=lambda t: (t[0], t[1]))
+        )
+
+        return hashlib.sha256(bytes.fromhex(self.dq.hash()) + on_bytes).hexdigest()
+
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")

@@ -393,6 +420,16 @@ class UDFStep(Step, ABC):
     min_task_size: Optional[int] = None
     batch_size: Optional[int] = None
 
+    def hash_inputs(self) -> str:
+        partition_by = ensure_sequence(self.partition_by or [])
+        parts = [
+            bytes.fromhex(self.udf.hash()),
+            bytes.fromhex(hash_column_elements(partition_by)),
+            str(self.is_generator).encode(),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
         """Method that creates a table where temp udf results will be saved"""

@@ -790,6 +827,9 @@ class SQLClause(Step, ABC):
 class SQLSelect(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query) -> Select:
         subquery = query.subquery()
         args = [

@@ -806,6 +846,9 @@ class SQLSelect(SQLClause):
 class SQLSelectExcept(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         subquery = query.subquery()
         args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]

@@ -817,6 +860,9 @@ class SQLMutate(SQLClause):
     args: tuple[Label, ...]
     new_schema: SignalSchema
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
         to_mutate = {c.name for c in self.args}

@@ -846,6 +892,9 @@ class SQLMutate(SQLClause):
 class SQLFilter(SQLClause):
     expressions: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.expressions)
+
     def __and__(self, other):
         expressions = self.parse_cols(self.expressions)
         return self.__class__(expressions + other)

@@ -859,6 +908,9 @@ class SQLFilter(SQLClause):
 class SQLOrderBy(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         args = self.parse_cols(self.args)
         return query.order_by(*args)

@@ -868,6 +920,9 @@ class SQLOrderBy(SQLClause):
 class SQLLimit(SQLClause):
     n: int
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.n).encode()).hexdigest()
+
     def apply_sql_clause(self, query: Select) -> Select:
         return query.limit(self.n)
 
@@ -876,12 +931,18 @@ class SQLLimit(SQLClause):
 class SQLOffset(SQLClause):
     offset: int
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.offset).encode()).hexdigest()
+
     def apply_sql_clause(self, query: "GenerativeSelect"):
         return query.offset(self.offset)
 
 
 @frozen
 class SQLCount(SQLClause):
+    def hash_inputs(self) -> str:
+        return ""
+
     def apply_sql_clause(self, query):
         return sqlalchemy.select(f.count(1)).select_from(query.subquery())
 
@@ -891,6 +952,9 @@ class SQLDistinct(SQLClause):
     args: tuple[ColumnElement, ...]
     dialect: str
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query):
         if self.dialect == "sqlite":
             return query.group_by(*self.args)

@@ -903,6 +967,11 @@ class SQLUnion(Step):
     query1: "DatasetQuery"
     query2: "DatasetQuery"
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(self.query1.hash()) + bytes.fromhex(self.query2.hash())
+        ).hexdigest()
+
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:

@@ -939,6 +1008,20 @@ class SQLJoin(Step):
     full: bool
     rname: str
 
+    def hash_inputs(self) -> str:
+        predicates = ensure_sequence(self.predicates or [])
+
+        parts = [
+            bytes.fromhex(self.query1.hash()),
+            bytes.fromhex(self.query2.hash()),
+            bytes.fromhex(hash_column_elements(predicates)),
+            str(self.inner).encode(),
+            str(self.full).encode(),
+            self.rname.encode("utf-8"),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
         query = dq.apply_steps().select()
         temp_tables.extend(dq.temp_table_names)

@@ -1060,6 +1143,13 @@ class SQLGroupBy(SQLClause):
     cols: Sequence[Union[str, Function, ColumnElement]]
     group_by: Sequence[Union[str, Function, ColumnElement]]
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(
+                hash_column_elements(self.cols) + hash_column_elements(self.group_by)
+            )
+        ).hexdigest()
+
     def apply_sql_clause(self, query) -> Select:
         if not self.cols:
             raise ValueError("No columns to select")

@@ -1213,6 +1303,23 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)
 
+    def hash(self) -> str:
+        """
+        Calculates hash of this class taking into account hash of starting step
+        and hashes of each following steps. Ordering is important.
+        """
+        hasher = hashlib.sha256()
+        if self.starting_step:
+            hasher.update(self.starting_step.hash().encode("utf-8"))
+        else:
+            assert self.list_ds_name
+            hasher.update(self.list_ds_name.encode("utf-8"))
+
+        for step in self.steps:
+            hasher.update(step.hash().encode("utf-8"))
+
+        return hasher.hexdigest()
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
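A standalone illustration of the hashing scheme used above — not datachain code. Each step hashes "ClassName|hash_inputs()", and the query hash folds the starting step plus every step digest in order, so reordering or changing any input changes the final digest:

    import hashlib

    def step_hash(step_name: str, inputs_digest: str) -> str:
        # Mirrors Step.hash(): sha256 over "<ClassName>|<hash_inputs()>"
        return hashlib.sha256(f"{step_name}|{inputs_digest}".encode()).hexdigest()

    hasher = hashlib.sha256()
    hasher.update(step_hash("SQLFilter", "abc123").encode("utf-8"))
    hasher.update(step_hash("SQLLimit", hashlib.sha256(b"100").hexdigest()).encode("utf-8"))
    print(hasher.hexdigest())  # changes if the step order or any input changes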
datachain/utils.py
CHANGED
@@ -537,3 +537,9 @@ def getenv_bool(name: str, default: bool = False) -> bool:
     if val is None:
         return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
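The new helper above normalizes scalar-or-sequence arguments (used by the step hashing in query/dataset.py). A quick sketch:

    from datachain.utils import ensure_sequence

    print(ensure_sequence([1, 2]))  # [1, 2]  — already a sequence, returned as-is
    print(ensure_sequence((1, 2)))  # (1, 2)
    print(ensure_sequence("abc"))   # ['abc'] — strings are wrapped, not iterated
    print(ensure_sequence(5))       # [5]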
{datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.32.3
+Version: 0.33.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -86,6 +86,7 @@ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
 Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"

@@ -102,7 +103,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.18.
+Requires-Dist: mypy==1.18.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
{datachain-0.32.3.dist-info → datachain-0.33.1.dist-info}/RECORD
CHANGED

@@ -2,10 +2,12 @@ datachain/__init__.py,sha256=BRqfLPoBRRycnndaxyba-i4ZrZCJl0As2pwV9RiNBr8,1822
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
+datachain/checkpoint.py,sha256=Ar6SnnDMN3fr5ZZm3Xpdbj2f9buhqeApad-B1Lyrr4Y,1152
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
-datachain/error.py,sha256=
+datachain/error.py,sha256=WR1MoO9BPI0hO1FVKVTS0hgyxxumywtDnSY7Sv1oE1c,1796
+datachain/hash_utils.py,sha256=tgyXlz1m0gsS3UkIxdb0fxtNfVsbO2-YrELtyGV5XYE,4515
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356

@@ -19,9 +21,9 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298

@@ -41,20 +43,21 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=urt-b9Osay-S4LmwyXUKyYp-JHUBlFewoUvYNP7W_Jw,14553
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
 datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
+datachain/client/http.py,sha256=oU4nxaOa3xNXkxprDjjIS5fufgRJS0eNHTau3FUC6sg,5171
 datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
-datachain/data_storage/job.py,sha256=
-datachain/data_storage/metastore.py,sha256=
-datachain/data_storage/schema.py,sha256=
+datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
+datachain/data_storage/metastore.py,sha256=TgLYAKraH1WsmteaAqO5TW2VzNZZM4_SASgcBlDzdr8,60218
+datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
+datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
 datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
-datachain/diff/__init__.py,sha256
+datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825

@@ -86,10 +89,10 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
 datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851

@@ -104,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=
+datachain/lib/dc/datachain.py,sha256=FBz-IzbLeh8cS8yI2WiGBkLjV4fN7YqqqnCuuuj0S-o,101111
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742

@@ -127,7 +130,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
 datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863

@@ -161,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.33.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.33.1.dist-info/METADATA,sha256=1D-XqF5TtHydJqpLRIRpld9UKQftLhw_RkDUjI_NE2c,13655
+datachain-0.33.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.33.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.33.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.33.1.dist-info/RECORD,,