datachain 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -144,19 +144,26 @@ def shutdown_process(
     return proc.wait()


-def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
     buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt

-        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-            line = buffer.decode("utf-8")
-            callback(line)
-            buffer = b""  # Clear buffer for next line
+    try:
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt

-    if buffer:  # Handle any remaining data in the buffer
-        line = buffer.decode("utf-8")
-        callback(line)
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8", errors="replace")
+                callback(line)
+                buffer = b""  # Clear buffer for the next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8", errors="replace")
+            callback(line)
+    finally:
+        try:
+            stream.close()  # Ensure output is closed
+        except Exception:  # noqa: BLE001, S110
+            pass


 class DatasetRowsFetcher(NodesThreadPool):
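
Note on the hunk above: the renamed process_output helper reads the pipe one byte at a time, invokes the callback for each newline- or carriage-return-terminated chunk, and now always closes the stream in a finally block. A minimal sketch of how such a reader can be driven from a thread (the echo command and on_line callback are illustrative; the import path assumes the helper stays module-level in datachain/catalog/catalog.py):

    import subprocess
    from threading import Thread

    from datachain.catalog.catalog import process_output  # assumed import path

    def on_line(line: str) -> None:
        print("captured:", line.rstrip())

    proc = subprocess.Popen(["echo", "hello"], stdout=subprocess.PIPE)
    reader = Thread(target=process_output, args=(proc.stdout, on_line), daemon=True)
    reader.start()
    proc.wait()
    reader.join()
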
@@ -1760,13 +1767,13 @@ class Catalog:
             recursive=recursive,
         )

+    @staticmethod
     def query(
-        self,
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-        capture_output: bool = False,
-        output_hook: Callable[[str], None] = noop,
+        stdout_callback: Optional[Callable[[str], None]] = None,
+        stderr_callback: Optional[Callable[[str], None]] = None,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
         interrupt_timeout: Optional[int] = None,
@@ -1781,13 +1788,18 @@ class Catalog:
            },
        )
        popen_kwargs: dict[str, Any] = {}
-        if capture_output:
-            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        if stdout_callback is not None:
+            popen_kwargs = {"stdout": subprocess.PIPE}
+        if stderr_callback is not None:
+            popen_kwargs["stderr"] = subprocess.PIPE

        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
            raise TerminationSignal(sig)

-        thread: Optional[Thread] = None
+        stdout_thread: Optional[Thread] = None
+        stderr_thread: Optional[Thread] = None
+
        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
            logger.info("Starting process %s", proc.pid)

@@ -1801,10 +1813,20 @@ class Catalog:
            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
            signal.signal(signal.SIGTERM, raise_termination_signal)
            try:
-                if capture_output:
-                    args = (proc.stdout, output_hook)
-                    thread = Thread(target=_process_stream, args=args, daemon=True)
-                    thread.start()
+                if stdout_callback is not None:
+                    stdout_thread = Thread(
+                        target=process_output,
+                        args=(proc.stdout, stdout_callback),
+                        daemon=True,
+                    )
+                    stdout_thread.start()
+                if stderr_callback is not None:
+                    stderr_thread = Thread(
+                        target=process_output,
+                        args=(proc.stderr, stderr_callback),
+                        daemon=True,
+                    )
+                    stderr_thread.start()

                proc.wait()
            except TerminationSignal as exc:
@@ -1822,8 +1844,22 @@ class Catalog:
            finally:
                signal.signal(signal.SIGTERM, orig_sigterm_handler)
                signal.signal(signal.SIGINT, orig_sigint_handler)
-                if thread:
-                    thread.join()  # wait for the reader thread
+                # wait for the reader thread
+                thread_join_timeout_seconds = 30
+                if stdout_thread is not None:
+                    stdout_thread.join(timeout=thread_join_timeout_seconds)
+                    if stdout_thread.is_alive():
+                        logger.warning(
+                            "stdout thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )
+                if stderr_thread is not None:
+                    stderr_thread.join(timeout=thread_join_timeout_seconds)
+                    if stderr_thread.is_alive():
+                        logger.warning(
+                            "stderr thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )

        logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
        if proc.returncode in (
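
Net effect of the three hunks above: Catalog.query is now a staticmethod, and output capture is opted into per stream with one reader thread per pipe instead of the old capture_output/output_hook pair. A hedged sketch of the new call shape, based only on the part of the signature visible in the hunks (the script body and the import path are placeholders):

    from datachain.catalog import Catalog  # assumed re-export

    def on_stdout(line: str) -> None:
        print("[out]", line, end="")

    def on_stderr(line: str) -> None:
        print("[err]", line, end="")

    Catalog.query(
        "print('hello from the query script')",
        stdout_callback=on_stdout,
        stderr_callback=on_stderr,
    )
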
@@ -0,0 +1,44 @@
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Union
+
+
+@dataclass
+class Checkpoint:
+    """
+    Represents a checkpoint within a job run.
+
+    A checkpoint marks a successfully completed stage of execution. In the event
+    of a failure, the job can resume from the most recent checkpoint rather than
+    starting over from the beginning.
+
+    Checkpoints can also be created in a "partial" mode, which indicates that the
+    work at this stage was only partially completed. For example, if a failure
+    occurs halfway through running a UDF, already computed results can still be
+    saved, allowing the job to resume from that partially completed state on
+    restart.
+    """
+
+    id: str
+    job_id: str
+    hash: str
+    partial: bool
+    created_at: datetime
+
+    @classmethod
+    def parse(
+        cls,
+        id: Union[str, uuid.UUID],
+        job_id: str,
+        _hash: str,
+        partial: bool,
+        created_at: datetime,
+    ) -> "Checkpoint":
+        return cls(
+            str(id),
+            job_id,
+            _hash,
+            bool(partial),
+            created_at,
+        )
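
A small illustration of the parse constructor above, which normalizes id to a string and partial to a bool (the values are made up; the import path follows the new datachain/checkpoint.py module listed in RECORD):

    import uuid
    from datetime import datetime, timezone

    from datachain.checkpoint import Checkpoint

    cp = Checkpoint.parse(
        uuid.uuid4(),               # accepted as str or uuid.UUID
        "job-123",
        "a" * 64,                   # stage hash, placeholder value
        0,                          # coerced to False
        datetime.now(timezone.utc),
    )
    assert isinstance(cp.id, str) and cp.partial is False
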
@@ -93,10 +93,11 @@ class Client(ABC):
         self.uri = self.get_uri(self.name)

     @staticmethod
-    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
+    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:  # noqa: PLR0911
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
+        from .http import HTTPClient, HTTPSClient
         from .local import FileClient
         from .s3 import ClientS3

@@ -114,6 +115,10 @@ class Client(ABC):
             return FileClient
         if protocol == HfClient.protocol:
             return HfClient
+        if protocol == HTTPClient.protocol:
+            return HTTPClient
+        if protocol == HTTPSClient.protocol:
+            return HTTPSClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
@@ -4,6 +4,7 @@ from enum import Enum
 class JobStatus(int, Enum):
     CREATED = 1
     SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
@@ -13,6 +13,7 @@ from uuid import uuid4
 from sqlalchemy import (
     JSON,
     BigInteger,
+    Boolean,
     Column,
     DateTime,
     ForeignKey,
@@ -24,6 +25,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.sql import func as f

+from datachain.checkpoint import Checkpoint
 from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import (
@@ -36,6 +38,7 @@ from datachain.dataset import (
     StorageURI,
 )
 from datachain.error import (
+    CheckpointNotFoundError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
     NamespaceDeleteNotAllowedError,
@@ -75,6 +78,7 @@ class AbstractMetastore(ABC, Serializable):
     dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
+    checkpoint_class: type[Checkpoint] = Checkpoint

     def __init__(
         self,
@@ -431,6 +435,35 @@ class AbstractMetastore(ABC, Serializable):
     def get_job_status(self, job_id: str) -> Optional[JobStatus]:
         """Returns the status of the given job."""

+    #
+    # Checkpoints
+    #
+
+    @abstractmethod
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+        """Returns all checkpoints related to some job"""
+
+    @abstractmethod
+    def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
+        """Gets single checkpoint by id"""
+
+    def find_checkpoint(
+        self, job_id: str, _hash: str, partial: bool = False, conn=None
+    ) -> Optional[Checkpoint]:
+        """
+        Tries to find checkpoint for a job with specific hash and optionally partial
+        """
+
+    @abstractmethod
+    def create_checkpoint(
+        self,
+        job_id: str,
+        _hash: str,
+        partial: bool = False,
+        conn: Optional[Any] = None,
+    ) -> Checkpoint:
+        """Creates new checkpoint"""
+

 class AbstractDBMetastore(AbstractMetastore):
     """
@@ -446,6 +479,7 @@ class AbstractDBMetastore(AbstractMetastore):
     DATASET_VERSION_TABLE = "datasets_versions"
     DATASET_DEPENDENCY_TABLE = "datasets_dependencies"
     JOBS_TABLE = "jobs"
+    CHECKPOINTS_TABLE = "checkpoints"

     db: "DatabaseEngine"

@@ -1663,3 +1697,106 @@ class AbstractDBMetastore(AbstractMetastore):
         if not results:
             return None
         return results[0][0]
+
+    #
+    # Checkpoints
+    #
+
+    @staticmethod
+    def _checkpoints_columns() -> "list[SchemaItem]":
+        return [
+            Column(
+                "id",
+                Text,
+                default=uuid4,
+                primary_key=True,
+                nullable=False,
+            ),
+            Column("job_id", Text, nullable=True),
+            Column("hash", Text, nullable=False),
+            Column("partial", Boolean, default=False),
+            Column("created_at", DateTime(timezone=True), nullable=False),
+            UniqueConstraint("job_id", "hash"),
+        ]
+
+    @cached_property
+    def _checkpoints_fields(self) -> list[str]:
+        return [c.name for c in self._checkpoints_columns() if c.name]  # type: ignore[attr-defined]
+
+    @cached_property
+    def _checkpoints(self) -> "Table":
+        return Table(
+            self.CHECKPOINTS_TABLE,
+            self.db.metadata,
+            *self._checkpoints_columns(),
+        )
+
+    @abstractmethod
+    def _checkpoints_insert(self) -> "Insert": ...
+
+    def _checkpoints_select(self, *columns) -> "Select":
+        if not columns:
+            return self._checkpoints.select()
+        return select(*columns)
+
+    def _checkpoints_delete(self) -> "Delete":
+        return self._checkpoints.delete()
+
+    def _checkpoints_query(self):
+        return self._checkpoints_select(
+            *[getattr(self._checkpoints.c, f) for f in self._checkpoints_fields]
+        )
+
+    def create_checkpoint(
+        self,
+        job_id: str,
+        _hash: str,
+        partial: bool = False,
+        conn: Optional[Any] = None,
+    ) -> Checkpoint:
+        """
+        Creates a new job query step.
+        """
+        checkpoint_id = str(uuid4())
+        self.db.execute(
+            self._checkpoints_insert().values(
+                id=checkpoint_id,
+                job_id=job_id,
+                hash=_hash,
+                partial=partial,
+                created_at=datetime.now(timezone.utc),
+            ),
+            conn=conn,
+        )
+        return self.get_checkpoint_by_id(checkpoint_id)
+
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+        """List checkpoints by job id."""
+        query = self._checkpoints_query().where(self._checkpoints.c.job_id == job_id)
+        rows = list(self.db.execute(query, conn=conn))
+
+        yield from [self.checkpoint_class.parse(*r) for r in rows]
+
+    def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
+        """Returns the checkpoint with the given ID."""
+        ch = self._checkpoints
+        query = self._checkpoints_select(ch).where(ch.c.id == checkpoint_id)
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            raise CheckpointNotFoundError(f"Checkpoint {checkpoint_id} not found")
+        return self.checkpoint_class.parse(*rows[0])
+
+    def find_checkpoint(
+        self, job_id: str, _hash: str, partial: bool = False, conn=None
+    ) -> Optional[Checkpoint]:
+        """
+        Tries to find checkpoint for a job with specific hash and optionally partial
+        """
+        ch = self._checkpoints
+        query = self._checkpoints_select(ch).where(
+            ch.c.job_id == job_id, ch.c.hash == _hash, ch.c.partial == partial
+        )
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            return None
+        return self.checkpoint_class.parse(*rows[0])
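
The abstract API and the AbstractDBMetastore implementation above boil down to three operations. A hedged sketch of how they might be used against a metastore instance (the metastore object, job id, and hash values are placeholders):

    # Record that a stage identified by its hash finished for this job.
    cp = metastore.create_checkpoint("job-123", _hash="a" * 64, partial=False)

    # On restart, look the stage up; None means it has to be recomputed.
    existing = metastore.find_checkpoint("job-123", _hash="a" * 64)

    # Enumerate everything recorded for the job.
    for checkpoint in metastore.list_checkpoints("job-123"):
        print(checkpoint.hash, checkpoint.partial, checkpoint.created_at)
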
@@ -51,7 +51,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
     """
     c_set: dict[str, sa.Column] = {}
     for c in columns:
-        if (ec := c_set.get(c.name, None)) is not None:
+        if (ec := c_set.get(c.name)) is not None:
             if str(ec.type) != str(c.type):
                 raise ValueError(
                     f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
@@ -459,6 +459,8 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.default_table_names.append(self._datasets_dependencies.name)
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)
+        self.db.create_table(self._checkpoints, if_not_exists=True)
+        self.default_table_names.append(self._checkpoints.name)

     def _init_namespaces_projects(self) -> None:
         """
@@ -543,6 +545,12 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)

+    #
+    # Checkpoints
+    #
+    def _checkpoints_insert(self) -> "Insert":
+        return sqlite.insert(self._checkpoints)
+
     #
     # Namespaces
     #
@@ -1,5 +1,3 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain

-
 C = Column


-def get_status_col_name() -> str:
-    """Returns new unique status col name"""
-    return "diff_" + "".join(
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"


 class CompareStatus(str, Enum):
@@ -101,9 +95,9 @@ def _compare(  # noqa: C901, PLR0912
     compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]

     # get diff column names
-    diff_col = status_col or get_status_col_name()
-    ldiff_col = get_status_col_name()
-    rdiff_col = get_status_col_name()
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME

     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})
@@ -227,7 +221,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col = get_status_col_name()
+    status_col = STATUS_COL_NAME

     res = _compare(
         left,
datachain/error.py CHANGED
@@ -97,3 +97,7 @@ class TableMissingError(DataChainError):


 class OutdatedDatabaseSchemaError(DataChainError):
     pass
+
+
+class CheckpointNotFoundError(NotFoundError):
+    pass
@@ -0,0 +1,147 @@
+import hashlib
+import inspect
+import json
+import textwrap
+from collections.abc import Sequence
+from typing import TypeVar, Union
+
+from sqlalchemy.sql.elements import (
+    BinaryExpression,
+    BindParameter,
+    ColumnElement,
+    Label,
+    Over,
+    UnaryExpression,
+)
+from sqlalchemy.sql.functions import Function
+
+T = TypeVar("T", bound=ColumnElement)
+ColumnLike = Union[str, T]
+
+
+def serialize_column_element(expr: Union[str, ColumnElement]) -> dict:  # noqa: PLR0911
+    """
+    Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.
+    """
+
+    # Binary operations: col > 5, col1 + col2, etc.
+    if isinstance(expr, BinaryExpression):
+        op = (
+            expr.operator.__name__
+            if hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+        return {
+            "type": "binary",
+            "op": op,
+            "left": serialize_column_element(expr.left),
+            "right": serialize_column_element(expr.right),
+        }
+
+    # Unary operations: -col, NOT col, etc.
+    if isinstance(expr, UnaryExpression):
+        op = (
+            expr.operator.__name__
+            if expr.operator is not None and hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+
+        return {
+            "type": "unary",
+            "op": op,
+            "element": serialize_column_element(expr.element),  # type: ignore[arg-type]
+        }
+
+    # Function calls: func.lower(col), func.count(col), etc.
+    if isinstance(expr, Function):
+        return {
+            "type": "function",
+            "name": expr.name,
+            "clauses": [serialize_column_element(c) for c in expr.clauses],
+        }
+
+    # Window functions: func.row_number().over(partition_by=..., order_by=...)
+    if isinstance(expr, Over):
+        return {
+            "type": "window",
+            "function": serialize_column_element(expr.element),
+            "partition_by": [
+                serialize_column_element(p) for p in getattr(expr, "partition_by", [])
+            ],
+            "order_by": [
+                serialize_column_element(o) for o in getattr(expr, "order_by", [])
+            ],
+        }
+
+    # Labeled expressions: col.label("alias")
+    if isinstance(expr, Label):
+        return {
+            "type": "label",
+            "name": expr.name,
+            "element": serialize_column_element(expr.element),
+        }
+
+    # Bound values (constants)
+    if isinstance(expr, BindParameter):
+        return {"type": "bind", "value": expr.value}
+
+    # Plain columns
+    if hasattr(expr, "name"):
+        return {"type": "column", "name": expr.name}
+
+    # Fallback: stringify unknown nodes
+    return {"type": "other", "repr": str(expr)}
+
+
+def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
+    """
+    Hash a list of ColumnElements deterministically, dialect agnostic.
+    Only accepts ordered iterables (like list or tuple).
+    """
+    serialized = [serialize_column_element(c) for c in columns]
+    json_str = json.dumps(serialized, sort_keys=True)  # stable JSON
+    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
+
+def hash_callable(func):
+    """
+    Calculate a hash from a callable.
+    Rules:
+    - Named functions (def) → use source code for stable, cross-version hashing
+    - Lambdas → use bytecode (deterministic in same Python runtime)
+    """
+    if not callable(func):
+        raise TypeError("Expected a callable")
+
+    # Determine if it is a lambda
+    is_lambda = func.__name__ == "<lambda>"
+
+    if not is_lambda:
+        # Try to get exact source of named function
+        try:
+            lines, _ = inspect.getsourcelines(func)
+            payload = textwrap.dedent("".join(lines)).strip()
+        except (OSError, TypeError):
+            # Fallback: bytecode if source not available
+            payload = func.__code__.co_code
+    else:
+        # For lambdas, fall back directly to bytecode
+        payload = func.__code__.co_code
+
+    # Normalize annotations
+    annotations = {
+        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
+    }
+
+    # Extras to distinguish functions with same code but different metadata
+    extras = {
+        "name": func.__name__,
+        "defaults": func.__defaults__,
+        "annotations": annotations,
+    }
+
+    # Compute SHA256
+    h = hashlib.sha256()
+    h.update(str(payload).encode() if isinstance(payload, str) else payload)
+    h.update(str(extras).encode())
+    return h.hexdigest()
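
A quick illustration of the two helpers above (the column names and the function are made up; the exact digests depend on the serialized structure):

    import sqlalchemy as sa

    from datachain.hash_utils import hash_callable, hash_column_elements

    size = sa.column("size")
    h1 = hash_column_elements([size > 10, sa.func.lower(sa.column("name"))])
    h2 = hash_column_elements([size > 10, sa.func.lower(sa.column("name"))])
    assert h1 == h2  # deterministic for the same expressions

    def add_one(x: int) -> int:
        return x + 1

    print(hash_callable(add_one))  # stable across runs, based on source + metadata
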
@@ -1,4 +1,5 @@
 import copy
+import hashlib
 import os
 import os.path
 import sys
@@ -18,6 +19,7 @@ from typing import (
     cast,
     overload,
 )
+from uuid import uuid4

 import sqlalchemy
 import ujson as json
@@ -207,6 +209,14 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()

+    def hash(self) -> str:
+        """
+        Calculates SHA hash of this chain. Hash calculation is fast and consistent.
+        It takes into account all the steps added to the chain and their inputs.
+        Order of the steps is important.
+        """
+        return self._query.hash()
+
     def _as_delta(
         self,
         on: Optional[Union[str, Sequence[str]]] = None,
@@ -665,7 +675,7 @@ class DataChain:
             name, namespace=namespace_name, project=project_name, **kwargs
         )

-        return self._evolve(
+        result = self._evolve(
             query=self._query.save(
                 name=name,
                 version=version,
@@ -678,6 +688,16 @@ class DataChain:
             )
         )

+        if job_id := os.getenv("DATACHAIN_JOB_ID"):
+            catalog.metastore.create_checkpoint(
+                job_id,
+                _hash=hashlib.sha256(  # TODO this will be replaced with self.hash()
+                    str(uuid4()).encode()
+                ).hexdigest(),
+            )
+
+        return result
+
     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.

@@ -1,4 +1,6 @@
 import copy
+import hashlib
+import json
 import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
@@ -257,6 +259,11 @@ class SignalSchema:
         signals["_custom_types"] = custom_types
         return signals

+    def hash(self) -> str:
+        """Create SHA hash of this schema"""
+        json_str = json.dumps(self.serialize(), sort_keys=True, separators=(",", ":"))
+        return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
     @staticmethod
     def _split_subtypes(type_name: str) -> list[str]:
         """This splits a list of subtypes, including proper square bracket handling."""
datachain/lib/udf.py CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import sys
 import traceback
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
@@ -12,6 +13,7 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -61,6 +63,9 @@ class UDFAdapter:
     batch_size: Optional[int] = None
     batch: int = 1

+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()
@@ -151,6 +156,21 @@ class UDFBase(AbstractUDF):
         self.output = None
         self._func = None

+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+        """
+        parts = [
+            hash_callable(self._func),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
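
For reference, UDFBase.hash above concatenates the raw bytes of each hex digest and hashes the result again. A self-contained sketch of that composition scheme (the input byte strings are made up):

    import hashlib

    parts = [
        hashlib.sha256(b"function source").hexdigest(),  # like hash_callable(self._func)
        "",                                              # params hash may be empty
        hashlib.sha256(b"output schema").hexdigest(),    # like self.output.hash()
    ]
    combined = hashlib.sha256(
        b"".join(bytes.fromhex(p) for p in parts)  # bytes.fromhex("") == b""
    ).hexdigest()
    print(combined)
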
@@ -1,4 +1,5 @@
 import contextlib
+import hashlib
 import inspect
 import logging
 import os
@@ -44,6 +45,7 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
+from datachain.hash_utils import hash_column_elements
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFAdapter, _get_cache
@@ -57,6 +59,7 @@ from datachain.sql.types import SQLType
 from datachain.utils import (
     determine_processes,
     determine_workers,
+    ensure_sequence,
     filtered_cloudpickle_dumps,
     get_datachain_executable,
     safe_closing,
@@ -167,6 +170,18 @@ class Step(ABC):
     ) -> "StepResult":
         """Apply the processing step."""

+    @abstractmethod
+    def hash_inputs(self) -> str:
+        """Calculates hash of step inputs"""
+
+    def hash(self) -> str:
+        """
+        Calculates hash for step which includes step name and hash of it's inputs
+        """
+        return hashlib.sha256(
+            f"{self.__class__.__name__}|{self.hash_inputs()}".encode()
+        ).hexdigest()
+

 @frozen
 class QueryStep:
@@ -186,6 +201,11 @@ class QueryStep:
             q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
         )

+    def hash(self) -> str:
+        return hashlib.sha256(
+            self.dataset.uri(self.dataset_version).encode()
+        ).hexdigest()
+

 def generator_then_call(generator, func: Callable):
     """
@@ -256,6 +276,13 @@ class DatasetDiffOperation(Step):
 class Subtract(DatasetDiffOperation):
     on: Sequence[tuple[str, str]]

+    def hash_inputs(self) -> str:
+        on_bytes = b"".join(
+            f"{a}:{b}".encode() for a, b in sorted(self.on, key=lambda t: (t[0], t[1]))
+        )
+
+        return hashlib.sha256(bytes.fromhex(self.dq.hash()) + on_bytes).hexdigest()
+
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")
@@ -393,6 +420,16 @@ class UDFStep(Step, ABC):
     min_task_size: Optional[int] = None
     batch_size: Optional[int] = None

+    def hash_inputs(self) -> str:
+        partition_by = ensure_sequence(self.partition_by or [])
+        parts = [
+            bytes.fromhex(self.udf.hash()),
+            bytes.fromhex(hash_column_elements(partition_by)),
+            str(self.is_generator).encode(),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
         """Method that creates a table where temp udf results will be saved"""
@@ -790,6 +827,9 @@ class SQLClause(Step, ABC):
 class SQLSelect(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query) -> Select:
         subquery = query.subquery()
         args = [
@@ -806,6 +846,9 @@ class SQLSelect(SQLClause):
 class SQLSelectExcept(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         subquery = query.subquery()
         args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]
@@ -817,6 +860,9 @@ class SQLMutate(SQLClause):
     args: tuple[Label, ...]
     new_schema: SignalSchema

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
         to_mutate = {c.name for c in self.args}
@@ -846,6 +892,9 @@ class SQLMutate(SQLClause):
 class SQLFilter(SQLClause):
     expressions: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.expressions)
+
     def __and__(self, other):
         expressions = self.parse_cols(self.expressions)
         return self.__class__(expressions + other)
@@ -859,6 +908,9 @@ class SQLFilter(SQLClause):
 class SQLOrderBy(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         args = self.parse_cols(self.args)
         return query.order_by(*args)
@@ -868,6 +920,9 @@ class SQLOrderBy(SQLClause):
 class SQLLimit(SQLClause):
     n: int

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.n).encode()).hexdigest()
+
     def apply_sql_clause(self, query: Select) -> Select:
         return query.limit(self.n)

@@ -876,12 +931,18 @@ class SQLLimit(SQLClause):
 class SQLOffset(SQLClause):
     offset: int

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.offset).encode()).hexdigest()
+
     def apply_sql_clause(self, query: "GenerativeSelect"):
         return query.offset(self.offset)


 @frozen
 class SQLCount(SQLClause):
+    def hash_inputs(self) -> str:
+        return ""
+
     def apply_sql_clause(self, query):
         return sqlalchemy.select(f.count(1)).select_from(query.subquery())

@@ -891,6 +952,9 @@ class SQLDistinct(SQLClause):
     args: tuple[ColumnElement, ...]
     dialect: str

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query):
         if self.dialect == "sqlite":
             return query.group_by(*self.args)
@@ -903,6 +967,11 @@ class SQLUnion(Step):
     query1: "DatasetQuery"
     query2: "DatasetQuery"

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(self.query1.hash()) + bytes.fromhex(self.query2.hash())
+        ).hexdigest()
+
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:
@@ -939,6 +1008,20 @@ class SQLJoin(Step):
     full: bool
     rname: str

+    def hash_inputs(self) -> str:
+        predicates = ensure_sequence(self.predicates or [])
+
+        parts = [
+            bytes.fromhex(self.query1.hash()),
+            bytes.fromhex(self.query2.hash()),
+            bytes.fromhex(hash_column_elements(predicates)),
+            str(self.inner).encode(),
+            str(self.full).encode(),
+            self.rname.encode("utf-8"),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
         query = dq.apply_steps().select()
         temp_tables.extend(dq.temp_table_names)
@@ -1060,6 +1143,13 @@ class SQLGroupBy(SQLClause):
     cols: Sequence[Union[str, Function, ColumnElement]]
     group_by: Sequence[Union[str, Function, ColumnElement]]

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(
+                hash_column_elements(self.cols) + hash_column_elements(self.group_by)
+            )
+        ).hexdigest()
+
     def apply_sql_clause(self, query) -> Select:
         if not self.cols:
             raise ValueError("No columns to select")
@@ -1213,6 +1303,23 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)

+    def hash(self) -> str:
+        """
+        Calculates hash of this class taking into account hash of starting step
+        and hashes of each following steps. Ordering is important.
+        """
+        hasher = hashlib.sha256()
+        if self.starting_step:
+            hasher.update(self.starting_step.hash().encode("utf-8"))
+        else:
+            assert self.list_ds_name
+            hasher.update(self.list_ds_name.encode("utf-8"))
+
+        for step in self.steps:
+            hasher.update(step.hash().encode("utf-8"))
+
+        return hasher.hexdigest()
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
datachain/utils.py CHANGED
@@ -537,3 +537,9 @@ def getenv_bool(name: str, default: bool = False) -> bool:
     if val is None:
         return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
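
ensure_sequence wraps scalars (and strings/bytes) in a list while passing real sequences through, e.g.:

    from datachain.utils import ensure_sequence

    ensure_sequence("name")      # -> ["name"], strings are treated as scalars
    ensure_sequence(["a", "b"])  # -> ["a", "b"]
    ensure_sequence(("a", "b"))  # -> ("a", "b"), tuples are already sequences
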
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.32.3
+Version: 0.33.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -86,6 +86,7 @@ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
 Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
@@ -102,7 +103,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.18.1; extra == "dev"
+Requires-Dist: mypy==1.18.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
@@ -2,10 +2,12 @@ datachain/__init__.py,sha256=BRqfLPoBRRycnndaxyba-i4ZrZCJl0As2pwV9RiNBr8,1822
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
+datachain/checkpoint.py,sha256=Ar6SnnDMN3fr5ZZm3Xpdbj2f9buhqeApad-B1Lyrr4Y,1152
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
-datachain/error.py,sha256=comKx1JCdjsBpxabrOWaiRP0aHBspBDZl1mkKFnBSq0,1739
+datachain/error.py,sha256=WR1MoO9BPI0hO1FVKVTS0hgyxxumywtDnSY7Sv1oE1c,1796
+datachain/hash_utils.py,sha256=tgyXlz1m0gsS3UkIxdb0fxtNfVsbO2-YrELtyGV5XYE,4515
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
@@ -19,9 +21,9 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
+datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
+datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
@@ -41,20 +43,21 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=sChjxu931QgU2-n9MdXlmOrhGAiAckXoDVZTxKcNv6M,14336
+datachain/client/fsspec.py,sha256=urt-b9Osay-S4LmwyXUKyYp-JHUBlFewoUvYNP7W_Jw,14553
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
 datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
+datachain/client/http.py,sha256=oU4nxaOa3xNXkxprDjjIS5fufgRJS0eNHTau3FUC6sg,5171
 datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
-datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
-datachain/data_storage/metastore.py,sha256=SrcMeHAjzwTbX8A3WEZ3zzQzVW1n7uamrGDtQXqucyE,55810
-datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
+datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
+datachain/data_storage/metastore.py,sha256=TgLYAKraH1WsmteaAqO5TW2VzNZZM4_SASgcBlDzdr8,60218
+datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=1fIeIhmB3O8oQVzP8dDKap0KUIgI0n2TdBQSyv0R8J4,30345
+datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
 datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
-datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
+datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825
@@ -86,10 +89,10 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
 datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
-datachain/lib/signal_schema.py,sha256=YMMcc9gHIzBz88zfsreGa1nOoO_56HBtZlT6jf3V1WE,39224
+datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=08ia5T3gClen5ZQfIgop-swNnys2G-RIZpszqDnbc0w,17570
+datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
@@ -104,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=pDgUmvmf0ENngFepoD0AkxxqiqNIgoRueejfojyuURQ,100458
+datachain/lib/dc/datachain.py,sha256=FBz-IzbLeh8cS8yI2WiGBkLjV4fN7YqqqnCuuuj0S-o,101111
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -127,7 +130,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
-datachain/query/dataset.py,sha256=1eg5EE4vKI7c_Ng04or6zzKmFcOoEubMCoOaYmYPavE,64499
+datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
 datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -161,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.32.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.32.3.dist-info/METADATA,sha256=MJCn0xaCu7eOuQl8AXKTFX4HTvPqtBPY93rCvcUcoBg,13607
-datachain-0.32.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.32.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.32.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.32.3.dist-info/RECORD,,
+datachain-0.33.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.33.1.dist-info/METADATA,sha256=1D-XqF5TtHydJqpLRIRpld9UKQftLhw_RkDUjI_NE2c,13655
+datachain-0.33.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.33.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.33.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.33.1.dist-info/RECORD,,