datachain 0.35.1__py3-none-any.whl → 0.36.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +103 -42
- datachain/catalog/dependency.py +164 -0
- datachain/data_storage/metastore.py +80 -0
- {datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/METADATA +1 -1
- {datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/RECORD +9 -8
- {datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/WHEEL +0 -0
- {datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -54,6 +54,7 @@ from datachain.sql.types import DateTime, SQLType
 from datachain.utils import DataChainDir
 
 from .datasource import DataSource
+from .dependency import build_dependency_hierarchy, populate_nested_dependencies
 
 if TYPE_CHECKING:
     from datachain.data_storage import AbstractMetastore, AbstractWarehouse
@@ -133,19 +134,26 @@ def shutdown_process(
     return proc.wait()
 
 
-def
+def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
     buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt
 
-
-
-
-            buffer = b""  # Clear buffer for next line
+    try:
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt
 
-
-
-
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8", errors="replace")
+                callback(line)
+                buffer = b""  # Clear buffer for the next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8", errors="replace")
+            callback(line)
+    finally:
+        try:
+            stream.close()  # Ensure output is closed
+        except Exception:  # noqa: BLE001, S110
+            pass
 
 
 class DatasetRowsFetcher(NodesThreadPool):
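An aside on the rewritten helper above: process_output reads the stream one byte at a time, hands a decoded line to the callback whenever it sees \n or \r, flushes any unterminated tail at EOF, and always closes the stream. A minimal sketch of that behaviour against an in-memory stream (the byte string below is invented for illustration):

import io

from datachain.catalog.catalog import process_output  # module-level helper per the hunk above

lines: list[str] = []
stream = io.BytesIO(b"first line\nsecond\rpartial tail")
process_output(stream, lines.append)

# Each emitted chunk keeps its terminator; the unterminated tail is flushed at EOF.
assert lines == ["first line\n", "second\r", "partial tail"]
assert stream.closed  # the finally block closed the stream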
@@ -1196,6 +1204,38 @@ class Catalog:
         assert isinstance(dataset_info, dict)
         return DatasetRecord.from_dict(dataset_info)
 
+    def get_dataset_dependencies_by_ids(
+        self,
+        dataset_id: int,
+        version_id: int,
+        indirect: bool = True,
+    ) -> list[DatasetDependency | None]:
+        dependency_nodes = self.metastore.get_dataset_dependency_nodes(
+            dataset_id=dataset_id,
+            version_id=version_id,
+        )
+
+        if not dependency_nodes:
+            return []
+
+        dependency_map, children_map = build_dependency_hierarchy(dependency_nodes)
+
+        root_key = (dataset_id, version_id)
+        if root_key not in children_map:
+            return []
+
+        root_dependency_ids = children_map[root_key]
+        root_dependencies = [dependency_map[dep_id] for dep_id in root_dependency_ids]
+
+        if indirect:
+            for dependency in root_dependencies:
+                if dependency is not None:
+                    populate_nested_dependencies(
+                        dependency, dependency_nodes, dependency_map, children_map
+                    )
+
+        return root_dependencies
+
     def get_dataset_dependencies(
         self,
         name: str,
@@ -1209,29 +1249,21 @@ class Catalog:
             namespace_name=namespace_name,
             project_name=project_name,
         )
-
-
-
-        )
+        dataset_version = dataset.get_version(version)
+        dataset_id = dataset.id
+        dataset_version_id = dataset_version.id
 
         if not indirect:
-            return
-
-
-
-                # dependency has been removed
-                continue
-            if d.is_dataset:
-                # only datasets can have dependencies
-                d.dependencies = self.get_dataset_dependencies(
-                    d.name,
-                    d.version,
-                    namespace_name=d.namespace,
-                    project_name=d.project,
-                    indirect=indirect,
-                )
+            return self.metastore.get_direct_dataset_dependencies(
+                dataset,
+                version,
+            )
 
-        return
+        return self.get_dataset_dependencies_by_ids(
+            dataset_id,
+            dataset_version_id,
+            indirect,
+        )
 
     def ls_datasets(
         self,
@@ -1747,13 +1779,13 @@ class Catalog:
             recursive=recursive,
         )
 
+    @staticmethod
     def query(
-        self,
         query_script: str,
         env: Mapping[str, str] | None = None,
         python_executable: str = sys.executable,
-
-
+        stdout_callback: Callable[[str], None] | None = None,
+        stderr_callback: Callable[[str], None] | None = None,
         params: dict[str, str] | None = None,
         job_id: str | None = None,
         reset: bool = False,
@@ -1773,13 +1805,18 @@ class Catalog:
             },
         )
         popen_kwargs: dict[str, Any] = {}
-
-
+
+        if stdout_callback is not None:
+            popen_kwargs = {"stdout": subprocess.PIPE}
+        if stderr_callback is not None:
+            popen_kwargs["stderr"] = subprocess.PIPE
 
         def raise_termination_signal(sig: int, _: Any) -> NoReturn:
             raise TerminationSignal(sig)
 
-
+        stdout_thread: Thread | None = None
+        stderr_thread: Thread | None = None
+
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
             logger.info("Starting process %s", proc.pid)
 
@@ -1793,10 +1830,20 @@ class Catalog:
             orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
             signal.signal(signal.SIGTERM, raise_termination_signal)
             try:
-                if
-
-
-
+                if stdout_callback is not None:
+                    stdout_thread = Thread(
+                        target=process_output,
+                        args=(proc.stdout, stdout_callback),
+                        daemon=True,
+                    )
+                    stdout_thread.start()
+                if stderr_callback is not None:
+                    stderr_thread = Thread(
+                        target=process_output,
+                        args=(proc.stderr, stderr_callback),
+                        daemon=True,
+                    )
+                    stderr_thread.start()
 
                 proc.wait()
             except TerminationSignal as exc:
@@ -1814,8 +1861,22 @@ class Catalog:
             finally:
                 signal.signal(signal.SIGTERM, orig_sigterm_handler)
                 signal.signal(signal.SIGINT, orig_sigint_handler)
-
-
+                # wait for the reader thread
+                thread_join_timeout_seconds = 30
+                if stdout_thread is not None:
+                    stdout_thread.join(timeout=thread_join_timeout_seconds)
+                    if stdout_thread.is_alive():
+                        logger.warning(
+                            "stdout thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )
+                if stderr_thread is not None:
+                    stderr_thread.join(timeout=thread_join_timeout_seconds)
+                    if stderr_thread.is_alive():
+                        logger.warning(
+                            "stderr thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )
 
         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode in (
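The query-related hunks above turn Catalog.query into a static method and add optional per-line stdout/stderr callbacks: when a callback is supplied, the child process's pipe is read by a daemon thread running process_output, and the threads are joined (with a 30-second timeout) after the process exits. A rough usage sketch, assuming a catalog obtained via datachain.catalog.get_catalog (not part of this diff) and a purely illustrative script body:

from datachain.catalog import get_catalog  # assumed helper; unchanged in this diff

def log_stdout(line: str) -> None:
    print("[query stdout]", line.rstrip())

def log_stderr(line: str) -> None:
    print("[query stderr]", line.rstrip())

catalog = get_catalog()
# The script body is only an example; any datachain query script would do.
catalog.query(
    "print('hello from the query subprocess')",
    stdout_callback=log_stdout,
    stderr_callback=log_stderr,
)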
datachain/catalog/dependency.py
ADDED
@@ -0,0 +1,164 @@
+import builtins
+from dataclasses import dataclass
+from datetime import datetime
+from typing import TypeVar
+
+from datachain.dataset import DatasetDependency
+
+DDN = TypeVar("DDN", bound="DatasetDependencyNode")
+
+
+@dataclass
+class DatasetDependencyNode:
+    namespace: str
+    project: str
+    id: int
+    dataset_id: int | None
+    dataset_version_id: int | None
+    dataset_name: str | None
+    dataset_version: str | None
+    created_at: datetime
+    source_dataset_id: int
+    source_dataset_version_id: int | None
+    depth: int
+
+    @classmethod
+    def parse(
+        cls: builtins.type[DDN],
+        namespace: str,
+        project: str,
+        id: int,
+        dataset_id: int | None,
+        dataset_version_id: int | None,
+        dataset_name: str | None,
+        dataset_version: str | None,
+        created_at: datetime,
+        source_dataset_id: int,
+        source_dataset_version_id: int | None,
+        depth: int,
+    ) -> "DatasetDependencyNode | None":
+        return cls(
+            namespace,
+            project,
+            id,
+            dataset_id,
+            dataset_version_id,
+            dataset_name,
+            dataset_version,
+            created_at,
+            source_dataset_id,
+            source_dataset_version_id,
+            depth,
+        )
+
+    def to_dependency(self) -> "DatasetDependency | None":
+        return DatasetDependency.parse(
+            namespace_name=self.namespace,
+            project_name=self.project,
+            id=self.id,
+            dataset_id=self.dataset_id,
+            dataset_version_id=self.dataset_version_id,
+            dataset_name=self.dataset_name,
+            dataset_version=self.dataset_version,
+            dataset_version_created_at=self.created_at,
+        )
+
+
+def build_dependency_hierarchy(
+    dependency_nodes: list[DatasetDependencyNode | None],
+) -> tuple[
+    dict[int, DatasetDependency | None], dict[tuple[int, int | None], list[int]]
+]:
+    """
+    Build dependency hierarchy from dependency nodes.
+
+    Args:
+        dependency_nodes: List of DatasetDependencyNode objects from the database
+
+    Returns:
+        Tuple of (dependency_map, children_map) where:
+        - dependency_map: Maps dependency_id -> DatasetDependency
+        - children_map: Maps (source_dataset_id, source_version_id) ->
+            list of dependency_ids
+    """
+    dependency_map: dict[int, DatasetDependency | None] = {}
+    children_map: dict[tuple[int, int | None], list[int]] = {}
+
+    for node in dependency_nodes:
+        if node is None:
+            continue
+        dependency = node.to_dependency()
+        parent_key = (node.source_dataset_id, node.source_dataset_version_id)
+
+        if dependency is not None:
+            dependency_map[dependency.id] = dependency
+            children_map.setdefault(parent_key, []).append(dependency.id)
+        else:
+            # Handle case where dependency creation failed (e.g., deleted dependency)
+            dependency_map[node.id] = None
+            children_map.setdefault(parent_key, []).append(node.id)
+
+    return dependency_map, children_map
+
+
+def populate_nested_dependencies(
+    dependency: DatasetDependency,
+    dependency_nodes: list[DatasetDependencyNode | None],
+    dependency_map: dict[int, DatasetDependency | None],
+    children_map: dict[tuple[int, int | None], list[int]],
+) -> None:
+    """
+    Recursively populate nested dependencies for a given dependency.
+
+    Args:
+        dependency: The dependency to populate nested dependencies for
+        dependency_nodes: All dependency nodes from the database
+        dependency_map: Maps dependency_id -> DatasetDependency
+        children_map: Maps (source_dataset_id, source_version_id) ->
+            list of dependency_ids
+    """
+    # Find the target dataset and version for this dependency
+    target_dataset_id, target_version_id = find_target_dataset_version(
+        dependency, dependency_nodes
+    )
+
+    if target_dataset_id is None or target_version_id is None:
+        return
+
+    # Get children for this target
+    target_key = (target_dataset_id, target_version_id)
+    if target_key not in children_map:
+        dependency.dependencies = []
+        return
+
+    child_dependency_ids = children_map[target_key]
+    child_dependencies = [dependency_map[child_id] for child_id in child_dependency_ids]
+
+    dependency.dependencies = child_dependencies
+
+    # Recursively populate children
+    for child_dependency in child_dependencies:
+        if child_dependency is not None:
+            populate_nested_dependencies(
+                child_dependency, dependency_nodes, dependency_map, children_map
+            )
+
+
+def find_target_dataset_version(
+    dependency: DatasetDependency,
+    dependency_nodes: list[DatasetDependencyNode | None],
+) -> tuple[int | None, int | None]:
+    """
+    Find the target dataset ID and version ID for a given dependency.
+
+    Args:
+        dependency: The dependency to find target for
+        dependency_nodes: All dependency nodes from the database
+
+    Returns:
+        Tuple of (target_dataset_id, target_version_id) or (None, None) if not found
+    """
+    for node in dependency_nodes:
+        if node is not None and node.id == dependency.id:
+            return node.dataset_id, node.dataset_version_id
+    return None, None
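Taken together, the module's helpers convert the flat rows returned by the metastore into a dependency tree: build_dependency_hierarchy indexes every node by dependency id and by its (source_dataset_id, source_version_id) parent key, and populate_nested_dependencies then fills in .dependencies recursively. The sketch below mirrors what Catalog.get_dataset_dependencies_by_ids (added earlier in this diff) does with these helpers; the function name and the way the catalog and ids are obtained are illustrative only:

from datachain.catalog.dependency import (
    build_dependency_hierarchy,
    populate_nested_dependencies,
)

def dependency_tree(catalog, dataset_id: int, version_id: int):
    # Flat rows covering the whole dependency graph of one dataset version.
    nodes = catalog.metastore.get_dataset_dependency_nodes(
        dataset_id=dataset_id, version_id=version_id
    )
    dependency_map, children_map = build_dependency_hierarchy(nodes)

    # Direct dependencies hang off the (dataset_id, version_id) key of the root.
    roots = [dependency_map[i] for i in children_map.get((dataset_id, version_id), [])]
    for dep in roots:
        if dep is not None:  # None marks a dependency whose dataset was deleted
            populate_nested_dependencies(dep, nodes, dependency_map, children_map)
    return roots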
datachain/data_storage/metastore.py
CHANGED
@@ -22,10 +22,12 @@ from sqlalchemy import (
     Text,
     UniqueConstraint,
     desc,
+    literal,
     select,
 )
 from sqlalchemy.sql import func as f
 
+from datachain.catalog.dependency import DatasetDependencyNode
 from datachain.checkpoint import Checkpoint
 from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
@@ -78,6 +80,7 @@ class AbstractMetastore(ABC, Serializable):
     dataset_list_class: type[DatasetListRecord] = DatasetListRecord
     dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
     dependency_class: type[DatasetDependency] = DatasetDependency
+    dependency_node_class: type[DatasetDependencyNode] = DatasetDependencyNode
     job_class: type[Job] = Job
     checkpoint_class: type[Checkpoint] = Checkpoint
 
@@ -366,6 +369,12 @@ class AbstractMetastore(ABC, Serializable):
     ) -> list[DatasetDependency | None]:
         """Gets direct dataset dependencies."""
 
+    @abstractmethod
+    def get_dataset_dependency_nodes(
+        self, dataset_id: int, version_id: int
+    ) -> list[DatasetDependencyNode | None]:
+        """Gets dataset dependency node from database."""
+
     @abstractmethod
     def remove_dataset_dependencies(
         self, dataset: DatasetRecord, version: str | None = None
@@ -1483,6 +1492,77 @@ class AbstractDBMetastore(AbstractMetastore):
 
         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
 
+    def get_dataset_dependency_nodes(
+        self, dataset_id: int, version_id: int
+    ) -> list[DatasetDependencyNode | None]:
+        n = self._namespaces_select().subquery()
+        p = self._projects
+        d = self._datasets_select().subquery()
+        dd = self._datasets_dependencies
+        dv = self._datasets_versions
+
+        # Common dependency fields for CTE
+        dep_fields = [
+            dd.c.id,
+            dd.c.source_dataset_id,
+            dd.c.source_dataset_version_id,
+            dd.c.dataset_id,
+            dd.c.dataset_version_id,
+        ]
+
+        # Base case: direct dependencies
+        base_query = select(
+            *dep_fields,
+            literal(0).label("depth"),
+        ).where(
+            (dd.c.source_dataset_id == dataset_id)
+            & (dd.c.source_dataset_version_id == version_id)
+        )
+
+        cte = base_query.cte(name="dependency_tree", recursive=True)
+
+        # Recursive case: dependencies of dependencies
+        recursive_query = select(
+            *dep_fields,
+            (cte.c.depth + 1).label("depth"),
+        ).select_from(
+            cte.join(
+                dd,
+                (cte.c.dataset_id == dd.c.source_dataset_id)
+                & (cte.c.dataset_version_id == dd.c.source_dataset_version_id),
+            )
+        )
+
+        cte = cte.union(recursive_query)
+
+        # Fetch all with full details
+        final_query = select(
+            n.c.name,
+            p.c.name,
+            cte.c.id,
+            cte.c.dataset_id,
+            cte.c.dataset_version_id,
+            d.c.name,
+            dv.c.version,
+            dv.c.created_at,
+            cte.c.source_dataset_id,
+            cte.c.source_dataset_version_id,
+            cte.c.depth,
+        ).select_from(
+            # Use outer joins to handle cases where dependent datasets have been
+            # physically deleted. This allows us to return dependency records with
+            # None values instead of silently omitting them, making broken
+            # dependencies visible to callers.
+            cte.join(d, cte.c.dataset_id == d.c.id, isouter=True)
+            .join(dv, cte.c.dataset_version_id == dv.c.id, isouter=True)
+            .join(p, d.c.project_id == p.c.id, isouter=True)
+            .join(n, p.c.namespace_id == n.c.id, isouter=True)
+        )
+
+        return [
+            self.dependency_node_class.parse(*r) for r in self.db.execute(final_query)
+        ]
+
     def remove_dataset_dependencies(
         self, dataset: DatasetRecord, version: str | None = None
     ) -> None:
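The new get_dataset_dependency_nodes method walks the whole dependency graph in one round trip using SQLAlchemy's recursive CTE support: a base SELECT seeds the tree with direct dependencies at depth 0, a recursive SELECT joins the CTE back onto the dependencies table at depth + 1, and the union is then outer-joined to the dataset/version/project/namespace tables so rows for deleted datasets survive as NULLs. A self-contained toy version of the same CTE pattern (table name, columns, and data are invented for illustration and are not datachain's schema):

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
metadata = sa.MetaData()
deps = sa.Table(
    "deps",
    metadata,
    sa.Column("source_id", sa.Integer),  # dataset that depends on another
    sa.Column("target_id", sa.Integer),  # dataset it depends on
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(
        deps.insert(),
        [
            {"source_id": 1, "target_id": 2},  # 1 -> 2 (direct)
            {"source_id": 2, "target_id": 3},  # 2 -> 3 (indirect for 1)
        ],
    )

    # Base case: direct dependencies of dataset 1, at depth 0.
    base = sa.select(deps.c.target_id, sa.literal(0).label("depth")).where(
        deps.c.source_id == 1
    )
    cte = base.cte(name="dependency_tree", recursive=True)

    # Recursive case: dependencies of rows already in the CTE, one level deeper.
    step = sa.select(deps.c.target_id, (cte.c.depth + 1).label("depth")).select_from(
        cte.join(deps, cte.c.target_id == deps.c.source_id)
    )
    cte = cte.union(step)

    rows = conn.execute(sa.select(cte).order_by(cte.c.depth)).all()
    print(rows)  # [(2, 0), (3, 1)]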
{datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/RECORD
CHANGED
@@ -24,8 +24,9 @@ datachain/studio.py,sha256=OHVAY8IcktgEHNSgYaJuBfAIln_nKBrF2j7BOM2Fxd0,15177
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=9KXA-fRH8lhK4E2JmdNOOH-74aUe-Sjb8wLiTiqXOh8,15710
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=Bb5xvC-qIGdUz_-epiFT9Eq6c3e00ZtNh_qFKyI_bp0,69862
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
+datachain/catalog/dependency.py,sha256=EHuu_Ox76sEhy71NXjFJiHxQVTz19KecqBcrjwFCa7M,5280
 datachain/catalog/loader.py,sha256=VTaGPc4ASNdUdr7Elobp8qcXUOHwd0oqQcnk3LUwtF0,6244
 datachain/cli/__init__.py,sha256=y7wfBmKiBwPJiIOhoeIOXXBWankYbjknm6OnauEPQxM,8203
 datachain/cli/utils.py,sha256=WAeK_DSWGsYAYp58P4C9EYuAlfbUjW8PI0wh3TCfNUo,3005
@@ -53,7 +54,7 @@ datachain/client/s3.py,sha256=KS9o0jxXJRFp7Isdibz366VaWrULmpegzfYdurJpAl0,7499
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=MGbrckXk5kHOfpjnhHhGpyJpAsgaBCxMmfd33hB2SWI,3756
 datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=NLGYLErWFUNXjKbEoESFkKW222MQdMCBlpuqaYVugsE,63484
 datachain/data_storage/schema.py,sha256=4FZZFgPTI9e3gUFdlm1smPdES7FHctwXQNdNfY69tj8,9807
 datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
 datachain/data_storage/sqlite.py,sha256=xQZ944neP57K_25HSetIy35IakAcyA0cUKVe-xeIEgQ,31168
@@ -164,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.36.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.36.0.dist-info/METADATA,sha256=ZH1x0Zcl8YD035rT1qvKm3D_NnSRgGtnD0TP2FNlwgI,13606
+datachain-0.36.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.36.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.36.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.36.0.dist-info/RECORD,,
{datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/WHEEL
File without changes
{datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/entry_points.txt
File without changes
{datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/licenses/LICENSE
File without changes
{datachain-0.35.1.dist-info → datachain-0.36.0.dist-info}/top_level.txt
File without changes