datachain: datachain-0.36.1-py3-none-any.whl → datachain-0.36.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -56,13 +56,15 @@ from datachain.project import Project
56
56
  from datachain.utils import JSONSerialize
57
57
 
58
58
  if TYPE_CHECKING:
59
- from sqlalchemy import Delete, Insert, Select, Update
59
+ from sqlalchemy import CTE, Delete, Insert, Select, Subquery, Update
60
60
  from sqlalchemy.schema import SchemaItem
61
+ from sqlalchemy.sql.elements import ColumnElement
61
62
 
62
63
  from datachain.data_storage import schema
63
64
  from datachain.data_storage.db_engine import DatabaseEngine
64
65
 
65
66
  logger = logging.getLogger("datachain")
67
+ DEPTH_LIMIT_DEFAULT = 100
66
68
 
67
69
 
68
70
  class AbstractMetastore(ABC, Serializable):
@@ -1463,6 +1465,18 @@ class AbstractDBMetastore(AbstractMetastore):
1463
1465
  Returns a list of columns to select in a query for fetching dataset dependencies
1464
1466
  """
1465
1467
 
1468
+ @abstractmethod
1469
+ def _dataset_dependency_nodes_select_columns(
1470
+ self,
1471
+ namespaces_subquery: "Subquery",
1472
+ dependency_tree_cte: "CTE",
1473
+ datasets_subquery: "Subquery",
1474
+ ) -> list["ColumnElement"]:
1475
+ """
1476
+ Returns a list of columns to select in a query for fetching
1477
+ dataset dependency nodes.
1478
+ """
1479
+
1466
1480
  def get_direct_dataset_dependencies(
1467
1481
  self, dataset: DatasetRecord, version: str
1468
1482
  ) -> list[DatasetDependency | None]:
@@ -1493,7 +1507,7 @@ class AbstractDBMetastore(AbstractMetastore):
1493
1507
  return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
1494
1508
 
1495
1509
  def get_dataset_dependency_nodes(
1496
- self, dataset_id: int, version_id: int
1510
+ self, dataset_id: int, version_id: int, depth_limit: int = DEPTH_LIMIT_DEFAULT
1497
1511
  ) -> list[DatasetDependencyNode | None]:
1498
1512
  n = self._namespaces_select().subquery()
1499
1513
  p = self._projects
@@ -1522,33 +1536,31 @@ class AbstractDBMetastore(AbstractMetastore):
1522
1536
  cte = base_query.cte(name="dependency_tree", recursive=True)
1523
1537
 
1524
1538
  # Recursive case: dependencies of dependencies
1525
- recursive_query = select(
1526
- *dep_fields,
1527
- (cte.c.depth + 1).label("depth"),
1528
- ).select_from(
1529
- cte.join(
1530
- dd,
1531
- (cte.c.dataset_id == dd.c.source_dataset_id)
1532
- & (cte.c.dataset_version_id == dd.c.source_dataset_version_id),
1539
+ # Limit recursion depth to depth_limit (default DEPTH_LIMIT_DEFAULT = 100) to prevent infinite loops in case of circular dependencies
1540
+ recursive_query = (
1541
+ select(
1542
+ *dep_fields,
1543
+ (cte.c.depth + 1).label("depth"),
1533
1544
  )
1545
+ .select_from(
1546
+ cte.join(
1547
+ dd,
1548
+ (cte.c.dataset_id == dd.c.source_dataset_id)
1549
+ & (cte.c.dataset_version_id == dd.c.source_dataset_version_id),
1550
+ )
1551
+ )
1552
+ .where(cte.c.depth < depth_limit)
1534
1553
  )
1535
1554
 
1536
1555
  cte = cte.union(recursive_query)
1537
1556
 
1538
1557
  # Fetch all with full details
1539
- final_query = select(
1540
- n.c.name,
1541
- p.c.name,
1542
- cte.c.id,
1543
- cte.c.dataset_id,
1544
- cte.c.dataset_version_id,
1545
- d.c.name,
1546
- dv.c.version,
1547
- dv.c.created_at,
1548
- cte.c.source_dataset_id,
1549
- cte.c.source_dataset_version_id,
1550
- cte.c.depth,
1551
- ).select_from(
1558
+ select_cols = self._dataset_dependency_nodes_select_columns(
1559
+ namespaces_subquery=n,
1560
+ dependency_tree_cte=cte,
1561
+ datasets_subquery=d,
1562
+ )
1563
+ final_query = self._datasets_dependencies_select(*select_cols).select_from(
1552
1564
  # Use outer joins to handle cases where dependent datasets have been
1553
1565
  # physically deleted. This allows us to return dependency records with
1554
1566
  # None values instead of silently omitting them, making broken
@@ -20,7 +20,10 @@ from sqlalchemy import (
20
20
  from sqlalchemy.dialects import sqlite
21
21
  from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
22
22
  from sqlalchemy.sql import func
23
- from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
23
+ from sqlalchemy.sql.elements import (
24
+ BinaryExpression,
25
+ BooleanClauseList,
26
+ )
24
27
  from sqlalchemy.sql.expression import bindparam, cast
25
28
  from sqlalchemy.sql.selectable import Select
26
29
  from tqdm.auto import tqdm
@@ -41,6 +44,7 @@ from datachain.sql.types import SQLType
41
44
  from datachain.utils import DataChainDir, batched, batched_it
42
45
 
43
46
  if TYPE_CHECKING:
47
+ from sqlalchemy import CTE, Subquery
44
48
  from sqlalchemy.dialects.sqlite import Insert
45
49
  from sqlalchemy.engine.base import Engine
46
50
  from sqlalchemy.schema import SchemaItem
@@ -539,6 +543,26 @@ class SQLiteMetastore(AbstractDBMetastore):
539
543
  self._datasets_versions.c.created_at,
540
544
  ]
541
545
 
546
+ def _dataset_dependency_nodes_select_columns(
547
+ self,
548
+ namespaces_subquery: "Subquery",
549
+ dependency_tree_cte: "CTE",
550
+ datasets_subquery: "Subquery",
551
+ ) -> list["ColumnElement"]:
552
+ return [
553
+ namespaces_subquery.c.name,
554
+ self._projects.c.name,
555
+ dependency_tree_cte.c.id,
556
+ dependency_tree_cte.c.dataset_id,
557
+ dependency_tree_cte.c.dataset_version_id,
558
+ datasets_subquery.c.name,
559
+ self._datasets_versions.c.version,
560
+ self._datasets_versions.c.created_at,
561
+ dependency_tree_cte.c.source_dataset_id,
562
+ dependency_tree_cte.c.source_dataset_version_id,
563
+ dependency_tree_cte.c.depth,
564
+ ]
565
+
542
566
  #
543
567
  # Jobs
544
568
  #
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.36.1
3
+ Version: 0.36.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -54,10 +54,10 @@ datachain/client/s3.py,sha256=KS9o0jxXJRFp7Isdibz366VaWrULmpegzfYdurJpAl0,7499
54
54
  datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
55
55
  datachain/data_storage/db_engine.py,sha256=MGbrckXk5kHOfpjnhHhGpyJpAsgaBCxMmfd33hB2SWI,3756
56
56
  datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
57
- datachain/data_storage/metastore.py,sha256=NLGYLErWFUNXjKbEoESFkKW222MQdMCBlpuqaYVugsE,63484
57
+ datachain/data_storage/metastore.py,sha256=lwlP-4TRSglC_a8lw1dUM8HRvW3kmDh51uaEj1nmKqI,64082
58
58
  datachain/data_storage/schema.py,sha256=3fAgiE11TIDYCW7EbTdiOm61SErRitvsLr7YPnUlVm0,9801
59
59
  datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
60
- datachain/data_storage/sqlite.py,sha256=MgQ6bfJ7LGW91UiVHQtSkj_5HalRi1aeHCEW__5JEe8,30959
60
+ datachain/data_storage/sqlite.py,sha256=pee99RewNQh5kVxGpD2sf9V5VloM4xwn8oeEhquU1rs,31756
61
61
  datachain/data_storage/warehouse.py,sha256=nuGT27visvAi7jr7ZAZF-wmFe0ZEFD8qaTheINX_7RM,35269
62
62
  datachain/diff/__init__.py,sha256=Fo3xMnctKyA0YtvnsBXQ-P5gQeeEwed17Tn_i7vfLKs,9332
63
63
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
165
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
166
166
  datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
167
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
168
- datachain-0.36.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
- datachain-0.36.1.dist-info/METADATA,sha256=BBaBx1Ail7RzpUlvEywlXKZtl_6Vn-KIEjm8OJdXrng,13657
170
- datachain-0.36.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- datachain-0.36.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
- datachain-0.36.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
- datachain-0.36.1.dist-info/RECORD,,
168
+ datachain-0.36.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.36.2.dist-info/METADATA,sha256=yuMCOoIfCTY3nwDSkXV8W63z3IwwCpgcA1jIrE3ba0s,13657
170
+ datachain-0.36.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.36.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.36.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.36.2.dist-info/RECORD,,