datachain 0.36.1__py3-none-any.whl → 0.36.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/data_storage/metastore.py +35 -23
- datachain/data_storage/sqlite.py +25 -1
- {datachain-0.36.1.dist-info → datachain-0.36.3.dist-info}/METADATA +2 -1
- {datachain-0.36.1.dist-info → datachain-0.36.3.dist-info}/RECORD +8 -8
- {datachain-0.36.1.dist-info → datachain-0.36.3.dist-info}/WHEEL +0 -0
- {datachain-0.36.1.dist-info → datachain-0.36.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.36.1.dist-info → datachain-0.36.3.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.36.1.dist-info → datachain-0.36.3.dist-info}/top_level.txt +0 -0
|
@@ -56,13 +56,15 @@ from datachain.project import Project
|
|
|
56
56
|
from datachain.utils import JSONSerialize
|
|
57
57
|
|
|
58
58
|
if TYPE_CHECKING:
|
|
59
|
-
from sqlalchemy import Delete, Insert, Select, Update
|
|
59
|
+
from sqlalchemy import CTE, Delete, Insert, Select, Subquery, Update
|
|
60
60
|
from sqlalchemy.schema import SchemaItem
|
|
61
|
+
from sqlalchemy.sql.elements import ColumnElement
|
|
61
62
|
|
|
62
63
|
from datachain.data_storage import schema
|
|
63
64
|
from datachain.data_storage.db_engine import DatabaseEngine
|
|
64
65
|
|
|
65
66
|
logger = logging.getLogger("datachain")
|
|
67
|
+
DEPTH_LIMIT_DEFAULT = 100
|
|
66
68
|
|
|
67
69
|
|
|
68
70
|
class AbstractMetastore(ABC, Serializable):
|
|
@@ -1463,6 +1465,18 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1463
1465
|
Returns a list of columns to select in a query for fetching dataset dependencies
|
|
1464
1466
|
"""
|
|
1465
1467
|
|
|
1468
|
+
@abstractmethod
|
|
1469
|
+
def _dataset_dependency_nodes_select_columns(
|
|
1470
|
+
self,
|
|
1471
|
+
namespaces_subquery: "Subquery",
|
|
1472
|
+
dependency_tree_cte: "CTE",
|
|
1473
|
+
datasets_subquery: "Subquery",
|
|
1474
|
+
) -> list["ColumnElement"]:
|
|
1475
|
+
"""
|
|
1476
|
+
Returns a list of columns to select in a query for fetching
|
|
1477
|
+
dataset dependency nodes.
|
|
1478
|
+
"""
|
|
1479
|
+
|
|
1466
1480
|
def get_direct_dataset_dependencies(
|
|
1467
1481
|
self, dataset: DatasetRecord, version: str
|
|
1468
1482
|
) -> list[DatasetDependency | None]:
|
|
@@ -1493,7 +1507,7 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1493
1507
|
return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
|
|
1494
1508
|
|
|
1495
1509
|
def get_dataset_dependency_nodes(
|
|
1496
|
-
self, dataset_id: int, version_id: int
|
|
1510
|
+
self, dataset_id: int, version_id: int, depth_limit: int = DEPTH_LIMIT_DEFAULT
|
|
1497
1511
|
) -> list[DatasetDependencyNode | None]:
|
|
1498
1512
|
n = self._namespaces_select().subquery()
|
|
1499
1513
|
p = self._projects
|
|
@@ -1522,33 +1536,31 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1522
1536
|
cte = base_query.cte(name="dependency_tree", recursive=True)
|
|
1523
1537
|
|
|
1524
1538
|
# Recursive case: dependencies of dependencies
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
(
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
dd,
|
|
1531
|
-
(cte.c.dataset_id == dd.c.source_dataset_id)
|
|
1532
|
-
& (cte.c.dataset_version_id == dd.c.source_dataset_version_id),
|
|
1539
|
+
# Limit depth to 100 to prevent infinite loops in case of circular dependencies
|
|
1540
|
+
recursive_query = (
|
|
1541
|
+
select(
|
|
1542
|
+
*dep_fields,
|
|
1543
|
+
(cte.c.depth + 1).label("depth"),
|
|
1533
1544
|
)
|
|
1545
|
+
.select_from(
|
|
1546
|
+
cte.join(
|
|
1547
|
+
dd,
|
|
1548
|
+
(cte.c.dataset_id == dd.c.source_dataset_id)
|
|
1549
|
+
& (cte.c.dataset_version_id == dd.c.source_dataset_version_id),
|
|
1550
|
+
)
|
|
1551
|
+
)
|
|
1552
|
+
.where(cte.c.depth < depth_limit)
|
|
1534
1553
|
)
|
|
1535
1554
|
|
|
1536
1555
|
cte = cte.union(recursive_query)
|
|
1537
1556
|
|
|
1538
1557
|
# Fetch all with full details
|
|
1539
|
-
|
|
1540
|
-
n
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
d.c.name,
|
|
1546
|
-
dv.c.version,
|
|
1547
|
-
dv.c.created_at,
|
|
1548
|
-
cte.c.source_dataset_id,
|
|
1549
|
-
cte.c.source_dataset_version_id,
|
|
1550
|
-
cte.c.depth,
|
|
1551
|
-
).select_from(
|
|
1558
|
+
select_cols = self._dataset_dependency_nodes_select_columns(
|
|
1559
|
+
namespaces_subquery=n,
|
|
1560
|
+
dependency_tree_cte=cte,
|
|
1561
|
+
datasets_subquery=d,
|
|
1562
|
+
)
|
|
1563
|
+
final_query = self._datasets_dependencies_select(*select_cols).select_from(
|
|
1552
1564
|
# Use outer joins to handle cases where dependent datasets have been
|
|
1553
1565
|
# physically deleted. This allows us to return dependency records with
|
|
1554
1566
|
# None values instead of silently omitting them, making broken
|
datachain/data_storage/sqlite.py
CHANGED
|
@@ -20,7 +20,10 @@ from sqlalchemy import (
|
|
|
20
20
|
from sqlalchemy.dialects import sqlite
|
|
21
21
|
from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
|
|
22
22
|
from sqlalchemy.sql import func
|
|
23
|
-
from sqlalchemy.sql.elements import
|
|
23
|
+
from sqlalchemy.sql.elements import (
|
|
24
|
+
BinaryExpression,
|
|
25
|
+
BooleanClauseList,
|
|
26
|
+
)
|
|
24
27
|
from sqlalchemy.sql.expression import bindparam, cast
|
|
25
28
|
from sqlalchemy.sql.selectable import Select
|
|
26
29
|
from tqdm.auto import tqdm
|
|
@@ -41,6 +44,7 @@ from datachain.sql.types import SQLType
|
|
|
41
44
|
from datachain.utils import DataChainDir, batched, batched_it
|
|
42
45
|
|
|
43
46
|
if TYPE_CHECKING:
|
|
47
|
+
from sqlalchemy import CTE, Subquery
|
|
44
48
|
from sqlalchemy.dialects.sqlite import Insert
|
|
45
49
|
from sqlalchemy.engine.base import Engine
|
|
46
50
|
from sqlalchemy.schema import SchemaItem
|
|
@@ -539,6 +543,26 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
539
543
|
self._datasets_versions.c.created_at,
|
|
540
544
|
]
|
|
541
545
|
|
|
546
|
+
def _dataset_dependency_nodes_select_columns(
|
|
547
|
+
self,
|
|
548
|
+
namespaces_subquery: "Subquery",
|
|
549
|
+
dependency_tree_cte: "CTE",
|
|
550
|
+
datasets_subquery: "Subquery",
|
|
551
|
+
) -> list["ColumnElement"]:
|
|
552
|
+
return [
|
|
553
|
+
namespaces_subquery.c.name,
|
|
554
|
+
self._projects.c.name,
|
|
555
|
+
dependency_tree_cte.c.id,
|
|
556
|
+
dependency_tree_cte.c.dataset_id,
|
|
557
|
+
dependency_tree_cte.c.dataset_version_id,
|
|
558
|
+
datasets_subquery.c.name,
|
|
559
|
+
self._datasets_versions.c.version,
|
|
560
|
+
self._datasets_versions.c.created_at,
|
|
561
|
+
dependency_tree_cte.c.source_dataset_id,
|
|
562
|
+
dependency_tree_cte.c.source_dataset_version_id,
|
|
563
|
+
dependency_tree_cte.c.depth,
|
|
564
|
+
]
|
|
565
|
+
|
|
542
566
|
#
|
|
543
567
|
# Jobs
|
|
544
568
|
#
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.36.1
|
|
3
|
+
Version: 0.36.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -76,6 +76,7 @@ Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
|
|
|
76
76
|
Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
|
|
77
77
|
Requires-Dist: fsspec>=2024.12.0; extra == "hf"
|
|
78
78
|
Requires-Dist: torch<2.9.0; extra == "hf"
|
|
79
|
+
Requires-Dist: torchcodec<0.8.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
|
|
79
80
|
Provides-Extra: video
|
|
80
81
|
Requires-Dist: ffmpeg-python; extra == "video"
|
|
81
82
|
Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
|
|
@@ -54,10 +54,10 @@ datachain/client/s3.py,sha256=KS9o0jxXJRFp7Isdibz366VaWrULmpegzfYdurJpAl0,7499
|
|
|
54
54
|
datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
|
|
55
55
|
datachain/data_storage/db_engine.py,sha256=MGbrckXk5kHOfpjnhHhGpyJpAsgaBCxMmfd33hB2SWI,3756
|
|
56
56
|
datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
|
|
57
|
-
datachain/data_storage/metastore.py,sha256=
|
|
57
|
+
datachain/data_storage/metastore.py,sha256=lwlP-4TRSglC_a8lw1dUM8HRvW3kmDh51uaEj1nmKqI,64082
|
|
58
58
|
datachain/data_storage/schema.py,sha256=3fAgiE11TIDYCW7EbTdiOm61SErRitvsLr7YPnUlVm0,9801
|
|
59
59
|
datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
|
|
60
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
60
|
+
datachain/data_storage/sqlite.py,sha256=pee99RewNQh5kVxGpD2sf9V5VloM4xwn8oeEhquU1rs,31756
|
|
61
61
|
datachain/data_storage/warehouse.py,sha256=nuGT27visvAi7jr7ZAZF-wmFe0ZEFD8qaTheINX_7RM,35269
|
|
62
62
|
datachain/diff/__init__.py,sha256=Fo3xMnctKyA0YtvnsBXQ-P5gQeeEwed17Tn_i7vfLKs,9332
|
|
63
63
|
datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
165
165
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
166
166
|
datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
|
|
167
167
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
168
|
-
datachain-0.36.
|
|
169
|
-
datachain-0.36.
|
|
170
|
-
datachain-0.36.
|
|
171
|
-
datachain-0.36.
|
|
172
|
-
datachain-0.36.
|
|
173
|
-
datachain-0.36.
|
|
168
|
+
datachain-0.36.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
169
|
+
datachain-0.36.3.dist-info/METADATA,sha256=pgqrjfNRW1OakoiLu5NnW6qUtSIISwHAMPJB6UGNFkA,13762
|
|
170
|
+
datachain-0.36.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
171
|
+
datachain-0.36.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
172
|
+
datachain-0.36.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
173
|
+
datachain-0.36.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|