deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. deriva_ml/__init__.py +2 -2
  2. deriva_ml/asset/asset.py +0 -4
  3. deriva_ml/catalog/__init__.py +6 -0
  4. deriva_ml/catalog/clone.py +1591 -38
  5. deriva_ml/catalog/localize.py +66 -29
  6. deriva_ml/core/base.py +12 -9
  7. deriva_ml/core/definitions.py +13 -12
  8. deriva_ml/core/ermrest.py +11 -12
  9. deriva_ml/core/mixins/annotation.py +2 -2
  10. deriva_ml/core/mixins/asset.py +3 -3
  11. deriva_ml/core/mixins/dataset.py +3 -3
  12. deriva_ml/core/mixins/execution.py +1 -0
  13. deriva_ml/core/mixins/feature.py +2 -2
  14. deriva_ml/core/mixins/file.py +2 -2
  15. deriva_ml/core/mixins/path_builder.py +2 -2
  16. deriva_ml/core/mixins/rid_resolution.py +2 -2
  17. deriva_ml/core/mixins/vocabulary.py +2 -2
  18. deriva_ml/core/mixins/workflow.py +3 -3
  19. deriva_ml/dataset/catalog_graph.py +3 -4
  20. deriva_ml/dataset/dataset.py +5 -3
  21. deriva_ml/dataset/dataset_bag.py +0 -2
  22. deriva_ml/dataset/upload.py +2 -2
  23. deriva_ml/demo_catalog.py +0 -1
  24. deriva_ml/execution/__init__.py +8 -8
  25. deriva_ml/execution/base_config.py +2 -2
  26. deriva_ml/execution/execution.py +5 -3
  27. deriva_ml/execution/execution_record.py +0 -1
  28. deriva_ml/execution/model_protocol.py +1 -1
  29. deriva_ml/execution/multirun_config.py +0 -1
  30. deriva_ml/execution/runner.py +3 -3
  31. deriva_ml/experiment/experiment.py +3 -3
  32. deriva_ml/feature.py +2 -2
  33. deriva_ml/interfaces.py +2 -2
  34. deriva_ml/model/__init__.py +45 -24
  35. deriva_ml/model/annotations.py +0 -1
  36. deriva_ml/model/catalog.py +3 -2
  37. deriva_ml/model/data_loader.py +330 -0
  38. deriva_ml/model/data_sources.py +439 -0
  39. deriva_ml/model/database.py +216 -32
  40. deriva_ml/model/fk_orderer.py +379 -0
  41. deriva_ml/model/handles.py +1 -1
  42. deriva_ml/model/schema_builder.py +816 -0
  43. deriva_ml/run_model.py +3 -3
  44. deriva_ml/schema/annotations.py +2 -1
  45. deriva_ml/schema/create_schema.py +1 -1
  46. deriva_ml/schema/validation.py +1 -1
  47. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
  48. deriva_ml-1.17.16.dist-info/RECORD +81 -0
  49. deriva_ml-1.17.14.dist-info/RECORD +0 -77
  50. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
  51. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
  52. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
  53. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0
@@ -20,7 +20,7 @@ from __future__ import annotations
 
 import json
 import logging
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime, timezone
 from enum import Enum
 from typing import Any
@@ -29,6 +29,9 @@ from urllib.parse import quote as urlquote
 from deriva.core import DerivaServer, ErmrestCatalog, get_credential
 from deriva.core.hatrac_store import HatracStore
 
+from deriva_ml.model.catalog import VOCAB_COLUMNS
+from deriva_ml.schema import create_ml_schema
+
 logger = logging.getLogger("deriva_ml")
 
 
@@ -87,9 +90,10 @@ class CloneIssue:
     details: str | None = None
     action: str | None = None
     row_count: int = 0
+    skipped_rids: list[str] | None = None  # RIDs of rows that were skipped
 
     def to_dict(self) -> dict[str, Any]:
-        return {
+        result = {
             "severity": self.severity.value,
             "category": self.category.value,
             "message": self.message,
@@ -98,6 +102,9 @@ class CloneIssue:
             "action": self.action,
             "row_count": self.row_count,
         }
+        if self.skipped_rids:
+            result["skipped_rids"] = self.skipped_rids
+        return result
 
     def __str__(self) -> str:
         parts = [f"[{self.severity.value.upper()}]"]
@@ -106,7 +113,32 @@ class CloneIssue:
         parts.append(self.message)
         if self.row_count > 0:
             parts.append(f"({self.row_count} rows)")
-        return " ".join(parts)
+        result = " ".join(parts)
+        if self.skipped_rids:
+            # For small numbers, list the RIDs; for large numbers, just show count
+            if len(self.skipped_rids) <= 5:
+                result += f"\n Skipped RIDs: {', '.join(self.skipped_rids)}"
+            else:
+                result += f"\n Skipped RIDs: {len(self.skipped_rids)} rows (see JSON for full list)"
+        return result
+
+
+@dataclass
+class CloneReportSummary:
+    """Summary statistics for a clone operation."""
+
+    total_issues: int
+    errors: int
+    warnings: int
+    tables_restored: int
+    tables_failed: int
+    tables_skipped: int
+    total_rows_restored: int
+    orphan_rows_removed: int
+    orphan_rows_nullified: int
+    fkeys_applied: int
+    fkeys_failed: int
+    fkeys_pruned: int
 
 
 @dataclass
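To illustrate the new `__str__` behavior, a hypothetical issue (table and RID values are made up, and the exact header text depends on fields not shown in this hunk):

    issue = CloneIssue(
        severity=CloneIssueSeverity.WARNING,
        category=CloneIssueCategory.ORPHAN_ROWS,
        message="Orphan rows deleted",
        table="demo:Subject",  # hypothetical
        row_count=3,
        skipped_rids=["1-A1", "1-A2", "1-A3"],  # hypothetical RIDs
    )
    print(issue)
    # Prints something like:
    # [WARNING] Orphan rows deleted (3 rows)
    #  Skipped RIDs: 1-A1, 1-A2, 1-A3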
@@ -134,27 +166,32 @@ class CloneReport:
     def add_issue(self, issue: CloneIssue) -> None:
         self.issues.append(issue)
 
+    @property
+    def summary(self) -> CloneReportSummary:
+        """Return summary statistics as a dataclass."""
+        return CloneReportSummary(
+            total_issues=len(self.issues),
+            errors=len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
+            warnings=len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
+            tables_restored=len(self.tables_restored),
+            tables_failed=len(self.tables_failed),
+            tables_skipped=len(self.tables_skipped),
+            total_rows_restored=sum(self.tables_restored.values()),
+            orphan_rows_removed=sum(
+                d.get("rows_removed", 0) for d in self.orphan_details.values()
+            ),
+            orphan_rows_nullified=sum(
+                d.get("rows_nullified", 0) for d in self.orphan_details.values()
+            ),
+            fkeys_applied=self.fkeys_applied,
+            fkeys_failed=self.fkeys_failed,
+            fkeys_pruned=self.fkeys_pruned,
+        )
+
     def to_dict(self) -> dict[str, Any]:
         """Return the report as a JSON-serializable dictionary."""
         return {
-            "summary": {
-                "total_issues": len(self.issues),
-                "errors": len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
-                "warnings": len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
-                "tables_restored": len(self.tables_restored),
-                "tables_failed": len(self.tables_failed),
-                "tables_skipped": len(self.tables_skipped),
-                "total_rows_restored": sum(self.tables_restored.values()),
-                "orphan_rows_removed": sum(
-                    d.get("rows_removed", 0) for d in self.orphan_details.values()
-                ),
-                "orphan_rows_nullified": sum(
-                    d.get("rows_nullified", 0) for d in self.orphan_details.values()
-                ),
-                "fkeys_applied": self.fkeys_applied,
-                "fkeys_failed": self.fkeys_failed,
-                "fkeys_pruned": self.fkeys_pruned,
-            },
+            "summary": asdict(self.summary),
             "issues": [i.to_dict() for i in self.issues],
             "tables_restored": self.tables_restored,
             "tables_failed": self.tables_failed,
@@ -332,6 +369,7 @@ class CloneDetails:
     source_catalog_id: str
     source_snapshot: str | None = None
     source_schema_url: str | None = None  # Hatrac URL to source schema JSON
+    # Clone parameters
     orphan_strategy: str = "fail"
    truncate_oversized: bool = False
    prune_hidden_fkeys: bool = False
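A note on the `CloneReport.summary` property added above: it and `to_dict()` now share one code path via `dataclasses.asdict`, so the two can be cross-checked directly. A minimal sketch, assuming a populated `CloneReport` named `report`:

    s = report.summary  # CloneReportSummary
    print(f"{s.tables_restored} tables restored ({s.total_rows_restored} rows), "
          f"{s.errors} errors, {s.warnings} warnings")
    assert report.to_dict()["summary"] == asdict(report.summary)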
@@ -339,15 +377,21 @@ class CloneDetails:
     asset_mode: str = "refs"
     exclude_schemas: list[str] = field(default_factory=list)
     exclude_objects: list[str] = field(default_factory=list)
+    add_ml_schema: bool = False
+    copy_annotations: bool = True
+    copy_policy: bool = True
+    reinitialize_dataset_versions: bool = True
+    # Statistics
     rows_copied: int = 0
     rows_skipped: int = 0
+    skipped_rids: list[str] = field(default_factory=list)  # RIDs of skipped rows
     truncated_count: int = 0
     orphan_rows_removed: int = 0
     orphan_rows_nullified: int = 0
     fkeys_pruned: int = 0
 
     def to_dict(self) -> dict[str, Any]:
-        return {
+        result = {
             "source_hostname": self.source_hostname,
             "source_catalog_id": self.source_catalog_id,
             "source_snapshot": self.source_snapshot,
@@ -359,6 +403,10 @@ class CloneDetails:
             "asset_mode": self.asset_mode,
             "exclude_schemas": self.exclude_schemas,
             "exclude_objects": self.exclude_objects,
+            "add_ml_schema": self.add_ml_schema,
+            "copy_annotations": self.copy_annotations,
+            "copy_policy": self.copy_policy,
+            "reinitialize_dataset_versions": self.reinitialize_dataset_versions,
             "rows_copied": self.rows_copied,
             "rows_skipped": self.rows_skipped,
             "truncated_count": self.truncated_count,
@@ -366,6 +414,9 @@ class CloneDetails:
             "orphan_rows_nullified": self.orphan_rows_nullified,
             "fkeys_pruned": self.fkeys_pruned,
         }
+        if self.skipped_rids:
+            result["skipped_rids"] = self.skipped_rids
+        return result
 
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> "CloneDetails":
@@ -381,8 +432,13 @@ class CloneDetails:
             asset_mode=data.get("asset_mode", "refs"),
             exclude_schemas=data.get("exclude_schemas", []),
             exclude_objects=data.get("exclude_objects", []),
+            add_ml_schema=data.get("add_ml_schema", False),
+            copy_annotations=data.get("copy_annotations", True),
+            copy_policy=data.get("copy_policy", True),
+            reinitialize_dataset_versions=data.get("reinitialize_dataset_versions", True),
             rows_copied=data.get("rows_copied", 0),
             rows_skipped=data.get("rows_skipped", 0),
+            skipped_rids=data.get("skipped_rids", []),
             truncated_count=data.get("truncated_count", 0),
             orphan_rows_removed=data.get("orphan_rows_removed", 0),
             orphan_rows_nullified=data.get("orphan_rows_nullified", 0),
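Because every new field gets a default in `from_dict()`, and `to_dict()` only emits `skipped_rids` when it is non-empty, clone details written by 1.17.14 still round-trip under 1.17.16. A sketch:

    details = CloneDetails(source_hostname="example.org", source_catalog_id="1")
    restored = CloneDetails.from_dict(details.to_dict())
    assert restored.copy_annotations is True  # new field, default applied
    assert restored.skipped_rids == []        # omitted from the dict when empty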
@@ -677,7 +733,7 @@ def _copy_table_data_with_retry(
     report: "CloneReport",
     deferred_indexes: dict[str, list[dict]],
     truncate_oversized: bool = False,
-) -> tuple[int, int, list[TruncatedValue]]:
+) -> tuple[int, int, list[str], list[TruncatedValue]]:
     """Copy data for a single table with retry logic for index errors.
 
     If a btree index size error occurs, this function will:
@@ -698,7 +754,7 @@ def _copy_table_data_with_retry(
         truncate_oversized: If True, truncate oversized values instead of skipping rows.
 
     Returns:
-        Tuple of (rows_copied, rows_skipped, truncated_values).
+        Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
         rows_copied is -1 if the copy failed entirely.
     """
     tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
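Every caller must now unpack a 4-tuple. A sketch of an updated call site, showing only the parameters visible in this diff (the full signature may include more):

    table_rows, rows_skipped, skipped_rids, truncated = _copy_table_data_with_retry(
        src_catalog=src_catalog,
        dst_catalog=dst_catalog,
        sname=sname,
        tname=tname,
        report=report,
        deferred_indexes=deferred_indexes,
        truncate_oversized=truncate_oversized,
    )
    if table_rows < 0:
        logger.warning(f"Copy failed after skipping {rows_skipped} rows: {skipped_rids}")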
@@ -711,6 +767,7 @@ def _copy_table_data_with_retry(
     last = None
     table_rows = 0
     rows_skipped = 0
+    skipped_rids: list[str] = []  # Track RIDs of skipped rows
     truncated_values: list[TruncatedValue] = []
     row_by_row_mode = False
     problematic_index = None
@@ -768,7 +825,7 @@ def _copy_table_data_with_retry(
             ).json()
         except Exception as e:
             logger.warning(f"Failed to read from {sname}:{tname}: {e}")
-            return -1, rows_skipped, truncated_values
+            return -1, rows_skipped, skipped_rids, truncated_values
 
         if not page:
             break
@@ -809,11 +866,14 @@ def _copy_table_data_with_retry(
 
                     rows_skipped += 1
                     rid = row.get('RID', 'unknown')
+                    skipped_rids.append(rid)
                     logger.debug(f"Skipping row {rid} in {table_key} due to index size limit")
                 else:
                     # Different error - log and skip
                     rows_skipped += 1
-                    logger.debug(f"Skipping row in {table_key}: {row_error}")
+                    rid = row.get('RID', 'unknown')
+                    skipped_rids.append(rid)
+                    logger.debug(f"Skipping row {rid} in {table_key}: {row_error}")
             last = page[-1]['RID']
         else:
             # Normal batch mode
@@ -884,14 +944,17 @@ def _copy_table_data_with_retry(
 
                         rows_skipped += 1
                         rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
                         logger.debug(f"Skipping row {rid} due to index size limit")
                     else:
                         rows_skipped += 1
-                        logger.debug(f"Skipping row: {row_error}")
+                        rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
+                        logger.debug(f"Skipping row {rid}: {row_error}")
                 last = page[-1]['RID']
             else:
                 logger.warning(f"Failed to write to {sname}:{tname}: {e}")
-                return -1, rows_skipped, truncated_values
+                return -1, rows_skipped, skipped_rids, truncated_values
 
     # Report skipped rows
     if rows_skipped > 0:
@@ -903,8 +966,9 @@ def _copy_table_data_with_retry(
             details=f"Index '{problematic_index}' on column '{problematic_column}'",
             action="These rows have values too large for btree index (>2704 bytes)",
             row_count=rows_skipped,
+            skipped_rids=skipped_rids if skipped_rids else None,
        ))
-        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits")
+        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits: RIDs={skipped_rids}")
 
    # Report truncated values
    if truncated_values:
@@ -919,7 +983,7 @@ def _copy_table_data_with_retry(
         ))
         logger.info(f"Truncated {len(truncated_values)} values in {table_key}")
 
-    return table_rows, rows_skipped, truncated_values
+    return table_rows, rows_skipped, skipped_rids, truncated_values
 
 
@@ -946,6 +1010,925 @@ def _rebuild_deferred_indexes(
     logger.info(f"Reporting {sum(len(v) for v in deferred_indexes.values())} index issues...")
 
 
+# =============================================================================
+# Subset Clone Helpers
+# =============================================================================
+
+
+# Export annotation tag
+_export_tag = "tag:isrd.isi.edu,2019:export"
+
+
+def _parse_export_annotation_tables(
+    table: Any,
+    paths_discovered: list[list[str]] | None = None,
+) -> tuple[list[str], list[list[str]]]:
+    """Parse export annotation from a table to extract tables and paths.
+
+    The export annotation (tag:isrd.isi.edu,2019:export) defines which tables
+    should be exported when downloading a row as a BDBag. This function extracts
+    the table names from the annotation paths.
+
+    Args:
+        table: ERMrest Table object with annotations.
+        paths_discovered: Optional list to append discovered paths to (for reuse).
+
+    Returns:
+        Tuple of (tables_list, paths_list) where:
+        - tables_list: List of table names in "schema:table" format
+        - paths_list: List of paths, each path is a list of "schema:table" strings
+    """
+    if paths_discovered is None:
+        paths_discovered = []
+
+    tables: set[str] = set()
+
+    # Add the root table itself
+    root_table_spec = f"{table.schema.name}:{table.name}"
+    tables.add(root_table_spec)
+
+    # Get the export annotation
+    export_annotation = table.annotations.get(_export_tag, {})
+
+    # Export annotations can have multiple contexts (*, detailed, etc.)
+    # We'll look at all of them
+    for context_key, context_value in export_annotation.items():
+        templates = context_value.get("templates", [])
+        for template in templates:
+            outputs = template.get("outputs", [])
+            for output in outputs:
+                source = output.get("source", {})
+                path_str = source.get("path", "")
+
+                if not path_str:
+                    continue
+
+                # Parse the path - it's in ERMrest format like "schema:table/schema:table2/..."
+                # Split by "/" and parse each segment
+                path_segments = path_str.split("/")
+                current_path: list[str] = [root_table_spec]
+
+                for segment in path_segments:
+                    # Skip empty segments
+                    if not segment:
+                        continue
+
+                    # Skip attribute projections (contain ":" followed by "=")
+                    if "=" in segment:
+                        continue
+
+                    # Parse schema:table format
+                    if ":" in segment:
+                        # Could be "schema:table" or complex path syntax
+                        # For simple schema:table, just add it
+                        parts = segment.split(":")
+                        if len(parts) == 2 and not any(c in segment for c in ["(", ")", "!", "@"]):
+                            schema, tname = parts
+                            table_spec = f"{schema}:{tname}"
+                            tables.add(table_spec)
+                            current_path.append(table_spec)
+
+                if len(current_path) > 1:
+                    paths_discovered.append(current_path)
+
+    return sorted(tables), paths_discovered
+
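A sketch of the annotation shape this parser walks, using a stub in place of a real ERMrest Table (all schema and table names here are hypothetical):

    from types import SimpleNamespace

    stub = SimpleNamespace(
        name="dataset",
        schema=SimpleNamespace(name="isa"),
        annotations={_export_tag: {"detailed": {"templates": [
            {"outputs": [{"source": {"path": "isa:biosample/isa:replicate"}}]},
        ]}}},
    )
    tables, paths = _parse_export_annotation_tables(stub)
    # tables == ["isa:biosample", "isa:dataset", "isa:replicate"]
    # paths  == [["isa:dataset", "isa:biosample", "isa:replicate"]]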
+def _compute_reachable_rids_from_paths(
+    catalog: ErmrestCatalog,
+    root_rid: str,
+    root_table: str,
+    paths: list[list[str]],
+    include_tables: list[str],
+    model: Any | None = None,
+) -> dict[str, set[str]]:
+    """Compute RIDs reachable from root_rid using predefined paths.
+
+    This is more efficient than FK graph traversal because it uses the paths
+    defined in the export annotation, which are already known to work.
+
+    After following the paths, also discovers FK references from reachable rows
+    back to tables in the include list. This ensures FK integrity by including
+    referenced rows that weren't found via the export paths.
+
+    Args:
+        catalog: Source catalog connection.
+        root_rid: Starting RID.
+        root_table: Root table in "schema:table" format.
+        paths: List of paths from export annotation, each path is a list of
+            "schema:table" strings starting with the root table.
+        include_tables: All tables to track reachability for.
+        model: Optional ERMrest Model for FK relationship discovery.
+
+    Returns:
+        Dict mapping "schema:table" -> set of reachable RIDs.
+    """
+    # Initialize reachable sets for all tables
+    reachable: dict[str, set[str]] = {t: set() for t in include_tables}
+    reachable[root_table].add(root_rid)
+
+    # Query each path from the export annotation
+    for path in paths:
+        if len(path) < 2:
+            continue
+
+        # Build ERMrest query following the path
+        # Start with the root table and RID filter
+        query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+
+        # Add each step in the path (skip the root table)
+        for table_spec in path[1:]:
+            query += f"/{_quote_table_spec(table_spec)}"
+
+        # Query for rows at the end of the path
+        target_table = path[-1]
+        if target_table not in reachable:
+            continue
+
+        try:
+            result = catalog.get(query).json()
+            for row in result:
+                if "RID" in row:
+                    reachable[target_table].add(row["RID"])
+            if result:
+                logger.debug(f"Path {' -> '.join(path)}: found {len(result)} rows")
+        except Exception as e:
+            logger.debug(f"Path query failed: {query}: {e}")
+            continue
+
+    # Note: FK reference expansion was too slow for large datasets and is disabled.
+    # Instead, rely on orphan_strategy (DELETE/NULLIFY) to handle any FK violations
+    # that occur when referenced rows weren't found via the export paths.
+
+    return reachable
+
+
+def _expand_reachable_via_fk_references(
+    catalog: ErmrestCatalog,
+    reachable: dict[str, set[str]],
+    include_tables: list[str],
+    model: Any,
+) -> None:
+    """Expand reachable RIDs by following FK references.
+
+    For each table with reachable rows, find FK columns that reference other
+    included tables and add the referenced RIDs to the reachable set.
+
+    Args:
+        catalog: Source catalog connection.
+        reachable: Dict mapping "schema:table" -> set of RIDs (modified in place).
+        include_tables: Tables to include.
+        model: ERMrest Model object.
+    """
+    # Build table lookup
+    table_lookup: dict[tuple[str, str], str] = {}
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        table_lookup[(schema, table_name)] = table_spec
+
+    # Iterate until no new RIDs are discovered
+    max_iterations = 10  # Prevent infinite loops
+    iteration = 0
+
+    while iteration < max_iterations:
+        iteration += 1
+        new_rids_found = False
+
+        for table_spec in include_tables:
+            current_rids = reachable.get(table_spec, set())
+            if not current_rids:
+                continue
+
+            schema, table_name = table_spec.split(":", 1)
+            try:
+                table = model.schemas[schema].tables[table_name]
+            except KeyError:
+                continue
+
+            # Check each FK for references to other included tables
+            for fk in table.foreign_keys:
+                pk_table = fk.pk_table
+                pk_key = (pk_table.schema.name, pk_table.name)
+                pk_spec = table_lookup.get(pk_key)
+
+                if not pk_spec:
+                    continue  # Target table not in our include list
+
+                # Get the FK column name
+                if not fk.foreign_key_columns:
+                    continue
+                fk_col = fk.foreign_key_columns[0].name
+
+                # Query for FK values from reachable rows
+                # Do this in batches to avoid URL length limits
+                # Ensure all RIDs are strings
+                rids_list = [str(r) for r in current_rids if r is not None]
+                batch_size = 100
+
+                for i in range(0, len(rids_list), batch_size):
+                    batch = rids_list[i:i + batch_size]
+                    rid_filter = ",".join(urlquote(r) for r in batch)
+
+                    try:
+                        # Get distinct FK values
+                        query = f"/attributegroup/{_quote_table_spec(table_spec)}/RID=any({rid_filter})/{urlquote(fk_col)}"
+                        result = catalog.get(query).json()
+
+                        for row in result:
+                            fk_value = row.get(fk_col)
+                            if fk_value is not None:
+                                # Ensure FK value is a string
+                                fk_value_str = str(fk_value)
+                                if fk_value_str not in reachable[pk_spec]:
+                                    reachable[pk_spec].add(fk_value_str)
+                                    new_rids_found = True
+                    except Exception as e:
+                        logger.debug(f"FK reference query failed: {e}")
+                        continue
+
+        if not new_rids_found:
+            break
+
+    if iteration > 1:
+        logger.debug(f"FK reference expansion completed in {iteration} iterations")
+
+
+def _expand_tables_with_associations(
+    model: Any,
+    include_tables: list[str],
+) -> tuple[list[str], list[str]]:
+    """Expand table list to include association tables needed for FK integrity.
+
+    Given a list of tables, finds all association tables that connect pairs
+    of included tables and adds them to the list.
+
+    Args:
+        model: ERMrest Model object.
+        include_tables: List of table names in "schema:table" format.
+
+    Returns:
+        Tuple of (all_tables, association_tables_added) where:
+        - all_tables: Original tables plus added association tables
+        - association_tables_added: Just the association tables that were added
+    """
+    # Parse table names to (schema, table) tuples
+    included_set: set[tuple[str, str]] = set()
+    for table_spec in include_tables:
+        if ":" in table_spec:
+            schema, table = table_spec.split(":", 1)
+            included_set.add((schema, table))
+        else:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+
+    # Find association tables connecting included tables
+    associations_added: list[str] = []
+
+    for schema_name, table_name in list(included_set):
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Check for associations from this table
+        for assoc in table.find_associations(pure=False):
+            assoc_table = assoc.table
+            assoc_key = (assoc_table.schema.name, assoc_table.name)
+
+            # Already included
+            if assoc_key in included_set:
+                continue
+
+            # Check if the other end of the association is in our included set
+            for other_fk in assoc.other_fkeys:
+                other_table = other_fk.pk_table
+                other_key = (other_table.schema.name, other_table.name)
+
+                if other_key in included_set:
+                    # This association connects two included tables
+                    included_set.add(assoc_key)
+                    assoc_spec = f"{assoc_key[0]}:{assoc_key[1]}"
+                    if assoc_spec not in associations_added:
+                        associations_added.append(assoc_spec)
+                    break
+
+    all_tables = list(include_tables) + associations_added
+    return all_tables, associations_added
+
+
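The intended effect, with hypothetical names: if `demo:Subject` and `demo:Image` are both included and an association table `demo:Subject_Image` links them, the expansion pulls it in automatically:

    all_tables, added = _expand_tables_with_associations(
        model, ["demo:Subject", "demo:Image"]
    )
    # added      == ["demo:Subject_Image"]  (hypothetical association table)
    # all_tables == ["demo:Subject", "demo:Image", "demo:Subject_Image"]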
+def _expand_tables_with_vocabularies(
+    model: Any,
+    include_tables: list[str],
+) -> tuple[list[str], list[str]]:
+    """Expand table list to include vocabulary tables referenced by included tables.
+
+    Examines FK targets of included tables and adds any that are vocabulary tables.
+
+    Args:
+        model: ERMrest Model object.
+        include_tables: List of table names in "schema:table" format.
+
+    Returns:
+        Tuple of (all_tables, vocabulary_tables_added) where:
+        - all_tables: Original tables plus added vocabulary tables
+        - vocabulary_tables_added: Just the vocabulary tables that were added
+    """
+    def is_vocabulary(table) -> bool:
+        return VOCAB_COLUMNS.issubset({c.name.upper() for c in table.columns})
+
+    # Parse table names
+    included_set: set[tuple[str, str]] = set()
+    for table_spec in include_tables:
+        if ":" in table_spec:
+            schema, table = table_spec.split(":", 1)
+            included_set.add((schema, table))
+
+    vocabularies_added: list[str] = []
+
+    for schema_name, table_name in list(included_set):
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Check FK targets for vocabulary tables
+        for fk in table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+
+            if pk_key in included_set:
+                continue
+
+            if is_vocabulary(pk_table):
+                included_set.add(pk_key)
+                vocab_spec = f"{pk_key[0]}:{pk_key[1]}"
+                if vocab_spec not in vocabularies_added:
+                    vocabularies_added.append(vocab_spec)
+
+    all_tables = list(include_tables) + vocabularies_added
+    return all_tables, vocabularies_added
+
+
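The vocabulary test here is purely structural: a referenced table counts as a vocabulary when its upper-cased column names cover `VOCAB_COLUMNS`. With hypothetical names:

    all_tables, vocab_added = _expand_tables_with_vocabularies(model, ["demo:Image"])
    # If demo:Image has an FK to demo:Image_Type, and that table carries the
    # standard vocabulary columns (whatever VOCAB_COLUMNS names), then:
    # vocab_added == ["demo:Image_Type"]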
1371
+ def _quote_table_spec(table_spec: str) -> str:
1372
+ """URL-quote a table specification for ERMrest queries.
1373
+
1374
+ ERMrest uses schema:table format where the colon must NOT be encoded.
1375
+ This function quotes the schema and table names separately.
1376
+
1377
+ Args:
1378
+ table_spec: Table specification in "schema:table" format.
1379
+
1380
+ Returns:
1381
+ URL-safe string with schema and table quoted but colon preserved.
1382
+ """
1383
+ schema, table = table_spec.split(":", 1)
1384
+ return f"{urlquote(schema)}:{urlquote(table)}"
1385
+
1386
+
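So a spec containing spaces quotes each half while keeping the separator intact:

    _quote_table_spec("My Schema:Image Type")  # -> "My%20Schema:Image%20Type"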
+def _discover_reachable_tables(
+    model: Any,
+    start_tables: list[str],
+    exclude_tables: set[tuple[str, str]] | None = None,
+    exclude_schemas: set[str] | None = None,
+) -> list[str]:
+    """Discover all tables reachable from start tables via FK relationships.
+
+    Traverses FK graph in both directions (outbound and inbound FKs) to find
+    all connected tables, excluding system schemas and specified exclusions.
+
+    Args:
+        model: ERMrest Model object.
+        start_tables: Starting tables in "schema:table" format.
+        exclude_tables: Set of (schema, table) tuples to exclude from discovery.
+        exclude_schemas: Set of schema names to exclude entirely.
+
+    Returns:
+        List of reachable table names in "schema:table" format.
+    """
+    exclude_tables = exclude_tables or set()
+    exclude_schemas = exclude_schemas or set()
+
+    # System schemas to always exclude
+    system_schemas = {"public", "_acl_admin", "WWW"}
+    all_excluded_schemas = system_schemas | exclude_schemas
+
+    # Parse start tables
+    discovered: set[tuple[str, str]] = set()
+    to_visit: list[tuple[str, str]] = []
+
+    for table_spec in start_tables:
+        if ":" not in table_spec:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+        schema, table = table_spec.split(":", 1)
+        key = (schema, table)
+        if key not in exclude_tables and schema not in all_excluded_schemas:
+            discovered.add(key)
+            to_visit.append(key)
+
+    # BFS traversal of FK graph
+    while to_visit:
+        current_key = to_visit.pop(0)
+        schema_name, table_name = current_key
+
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Find connected tables via outbound FKs (this table references other tables)
+        for fk in table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+
+            if pk_key in discovered or pk_key in exclude_tables:
+                continue
+            if pk_table.schema.name in all_excluded_schemas:
+                continue
+
+            discovered.add(pk_key)
+            to_visit.append(pk_key)
+
+        # Find connected tables via inbound FKs (other tables reference this table)
+        for fk in table.referenced_by:
+            ref_table = fk.table
+            ref_key = (ref_table.schema.name, ref_table.name)
+
+            if ref_key in discovered or ref_key in exclude_tables:
+                continue
+            if ref_table.schema.name in all_excluded_schemas:
+                continue
+
+            discovered.add(ref_key)
+            to_visit.append(ref_key)
+
+    # Convert to schema:table format
+    return [f"{schema}:{table}" for schema, table in sorted(discovered)]
+
+
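Usage sketch (hypothetical names); note the traversal is bidirectional, so tables that merely reference a start table are discovered too:

    tables = _discover_reachable_tables(
        model,
        ["demo:Subject"],
        exclude_tables={("demo", "Execution")},
        exclude_schemas={"audit"},
    )
    # e.g. ["demo:Image", "demo:Subject", "demo:Subject_Image", ...]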
+def _build_path_query(
+    root_table: str,
+    root_rid: str,
+    path: list[tuple[str, str]],
+) -> str:
+    """Build an ERMrest path query to traverse FK relationships.
+
+    Args:
+        root_table: Starting table in "schema:table" format.
+        root_rid: RID of the starting row.
+        path: List of (schema, table) tuples representing the path.
+
+    Returns:
+        ERMrest query string like "/entity/Schema:Table/RID=X/Schema:Next/..."
+    """
+    query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+    for schema, table in path:
+        query += f"/{urlquote(schema)}:{urlquote(table)}"
+    return query
+
+
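For example, with a hypothetical root and a two-step path:

    _build_path_query("isa:dataset", "1-ABC", [("isa", "biosample"), ("isa", "replicate")])
    # -> "/entity/isa:dataset/RID=1-ABC/isa:biosample/isa:replicate"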
+def _compute_reachable_rids(
+    catalog: ErmrestCatalog,
+    root_rid: str,
+    include_tables: list[str],
+    model: Any,
+) -> dict[str, set[str]]:
+    """Compute RIDs reachable from root_rid for each included table.
+
+    Uses FK graph traversal (both directions) to find all rows that are
+    connected to the root row through FK relationships.
+
+    Args:
+        catalog: Source catalog connection.
+        root_rid: Starting RID.
+        include_tables: Tables to compute reachability for ("schema:table" format).
+        model: ERMrest Model object.
+
+    Returns:
+        Dict mapping "schema:table" -> set of reachable RIDs.
+    """
+    # First, resolve the root RID to find its table
+    root_table_key = None
+    root_table = None
+
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        try:
+            uri = f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(root_rid)}"
+            result = catalog.get(uri).json()
+            if result:
+                root_table_key = table_spec
+                root_table = model.schemas[schema].tables[table_name]
+                break
+        except Exception:
+            continue
+
+    if root_table_key is None:
+        raise ValueError(f"Root RID {root_rid} not found in any of the included tables")
+
+    # Initialize reachable sets
+    reachable: dict[str, set[str]] = {t: set() for t in include_tables}
+    reachable[root_table_key].add(root_rid)
+
+    # Parse include_tables to lookup dict
+    table_lookup: dict[tuple[str, str], str] = {}
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        table_lookup[(schema, table_name)] = table_spec
+
+    # Build paths from root table using FK traversal (both directions)
+    def find_paths(
+        start_table: Any,
+        visited: set[tuple[str, str]],
+        current_path: list[tuple[str, str]],
+    ) -> list[list[tuple[str, str]]]:
+        """Recursively find all FK paths from start_table to included tables."""
+        paths = []
+
+        # Get all connected tables (both FK directions)
+        connected = []
+
+        # Outbound FKs (this table references other tables)
+        for fk in start_table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+            if pk_key not in visited and pk_key in table_lookup:
+                connected.append(pk_table)
+
+        # Inbound FKs (other tables reference this table)
+        for fk in start_table.referenced_by:
+            ref_table = fk.table
+            ref_key = (ref_table.schema.name, ref_table.name)
+            if ref_key not in visited and ref_key in table_lookup:
+                connected.append(ref_table)
+
+        for next_table in connected:
+            next_key = (next_table.schema.name, next_table.name)
+            new_path = current_path + [next_key]
+
+            # This path reaches the target table
+            paths.append(new_path)
+
+            # Continue exploring from this table
+            new_visited = visited | {next_key}
+            paths.extend(find_paths(next_table, new_visited, new_path))
+
+        return paths
+
+    # Find all paths from root table
+    root_key = (root_table.schema.name, root_table.name)
+    all_paths = find_paths(root_table, {root_key}, [])
+
+    # For each path, query for reachable rows
+    for path in all_paths:
+        if not path:
+            continue
+
+        target_key = path[-1]
+        target_spec = table_lookup.get(target_key)
+        if not target_spec:
+            continue
+
+        # Build and execute the path query
+        query = _build_path_query(root_table_key, root_rid, path)
+        try:
+            result = catalog.get(query).json()
+            for row in result:
+                if "RID" in row:
+                    reachable[target_spec].add(row["RID"])
+        except Exception as e:
+            logger.debug(f"Path query failed: {query}: {e}")
+            continue
+
+    # Also need to check if reachable rows reference other reachable rows
+    # This handles transitive reachability through association tables
+    changed = True
+    while changed:
+        changed = False
+        for table_spec in include_tables:
+            schema, table_name = table_spec.split(":", 1)
+            try:
+                table = model.schemas[schema].tables[table_name]
+            except KeyError:
+                continue
+
+            current_rids = reachable[table_spec].copy()
+
+            # Check FKs from this table
+            for fk in table.foreign_keys:
+                pk_table = fk.pk_table
+                pk_spec = f"{pk_table.schema.name}:{pk_table.name}"
+                if pk_spec not in include_tables:
+                    continue
+
+                fk_col = fk.foreign_key_columns[0].name
+
+                # For each reachable row in this table, find the referenced row
+                for rid in current_rids:
+                    try:
+                        row = catalog.get(f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(rid)}").json()
+                        if row and fk_col in row[0] and row[0][fk_col]:
+                            ref_rid = row[0][fk_col]
+                            if ref_rid not in reachable[pk_spec]:
+                                reachable[pk_spec].add(ref_rid)
+                                changed = True
+                    except Exception:
+                        continue
+
+            # Check FKs to this table (inbound)
+            for fk in table.referenced_by:
+                ref_table = fk.table
+                ref_spec = f"{ref_table.schema.name}:{ref_table.name}"
+                if ref_spec not in include_tables:
+                    continue
+
+                fk_col = fk.foreign_key_columns[0].name
+
+                # For each reachable row in this table, find rows that reference it
+                for rid in current_rids:
+                    try:
+                        result = catalog.get(
+                            f"/entity/{_quote_table_spec(ref_spec)}/{urlquote(fk_col)}={urlquote(rid)}"
+                        ).json()
+                        for row in result:
+                            if "RID" in row and row["RID"] not in reachable[ref_spec]:
+                                reachable[ref_spec].add(row["RID"])
+                                changed = True
+                    except Exception:
+                        continue
+
+    return reachable
+
+
+def _copy_data_via_export_paths(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    root_table: str,
+    root_rid: str,
+    export_paths: list[list[str]],
+    all_tables: list[str],
+    report: "CloneReport",
+    truncate_oversized: bool = False,
+    page_size: int = 1000,
+) -> dict[str, int]:
+    """Copy data using export paths to respect row-level security.
+
+    Instead of computing reachable RIDs and fetching them individually (which can
+    fail due to row-level ACLs), this function copies data by following the export
+    paths directly. This ensures we only copy rows that are actually visible.
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        root_table: Root table in "schema:table" format.
+        root_rid: Root RID to start from.
+        export_paths: Paths from export annotation.
+        all_tables: All tables to copy (for vocabulary tables not in paths).
+        report: Clone report for recording issues.
+        truncate_oversized: Whether to truncate oversized values.
+        page_size: Number of rows per batch.
+
+    Returns:
+        Dict mapping table spec -> rows copied.
+    """
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+    rows_by_table: dict[str, int] = {}
+
+    def truncate_row(row: dict) -> tuple[dict, list[TruncatedValue]]:
+        """Truncate oversized values in a row."""
+        truncated_values = []
+        modified = row.copy()
+        for col, value in row.items():
+            if isinstance(value, str):
+                value_bytes = len(value.encode('utf-8'))
+                if value_bytes > MAX_INDEX_VALUE_BYTES:
+                    max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                    truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                        max_chars -= 100
+                        truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    modified[col] = truncated
+                    truncated_values.append(TruncatedValue(
+                        table="",
+                        rid=str(row.get('RID', 'unknown')),
+                        column=col,
+                        original_bytes=value_bytes,
+                        truncated_bytes=len(truncated.encode('utf-8')),
+                    ))
+        return modified, truncated_values
+
+    def copy_with_path(path_query: str, table_spec: str) -> int:
+        """Copy data using a path query, returning rows copied."""
+        sname, tname = table_spec.split(":", 1)
+        tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+        rows_copied = 0
+        last_rid = ""
+
+        while True:
+            # Add pagination
+            if last_rid:
+                query = f"{path_query}@sort(RID)@after({urlquote(last_rid)})?limit={page_size}"
+            else:
+                query = f"{path_query}@sort(RID)?limit={page_size}"
+
+            try:
+                page = src_catalog.get(query).json()
+            except Exception as e:
+                logger.warning(f"Path query failed {path_query}: {e}")
+                break
+
+            if not page:
+                break
+
+            # Process rows
+            rows_to_insert = []
+            for row in page:
+                if truncate_oversized:
+                    modified, _ = truncate_row(row)
+                    rows_to_insert.append(modified)
+                else:
+                    rows_to_insert.append(row)
+
+            # Insert
+            try:
+                dst_catalog.post(f"/entity/{tname_uri}", json=rows_to_insert)
+                rows_copied += len(rows_to_insert)
+            except Exception as e:
+                # Try row-by-row on failure
+                for row in rows_to_insert:
+                    try:
+                        dst_catalog.post(f"/entity/{tname_uri}", json=[row])
+                        rows_copied += 1
+                    except Exception:
+                        logger.debug(f"Failed to insert row: {e}")
+
+            if len(page) < page_size:
+                break
+            last_rid = page[-1].get("RID", "")
+            if not last_rid:
+                break
+
+        return rows_copied
+
+    # Copy root table (just the root row)
+    root_sname, root_tname = root_table.split(":", 1)
+    root_uri = f"{urlquote(root_sname)}:{urlquote(root_tname)}"
+    try:
+        root_row = src_catalog.get(f"/entity/{root_uri}/RID={urlquote(root_rid)}").json()
+        if root_row:
+            dst_catalog.post(f"/entity/{root_uri}", json=root_row)
+            rows_by_table[root_table] = 1
+            logger.info(f"Copied 1 row for {root_table}")
+    except Exception as e:
+        logger.warning(f"Failed to copy root row: {e}")
+
+    # Copy data for each export path
+    tables_copied = {root_table}
+    for path in export_paths:
+        if len(path) < 2:
+            continue
+
+        # Build the path query starting from root
+        query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+        for table_spec in path[1:]:
+            query += f"/{_quote_table_spec(table_spec)}"
+
+        target_table = path[-1]
+        if target_table in tables_copied:
+            continue
+
+        rows = copy_with_path(query, target_table)
+        rows_by_table[target_table] = rows_by_table.get(target_table, 0) + rows
+        tables_copied.add(target_table)
+        logger.info(f"Copied {rows} rows for {target_table}")
+
+    # Copy vocabulary tables (full copy since they're not in paths)
+    for table_spec in all_tables:
+        if table_spec in tables_copied:
+            continue
+
+        # Check if it's a vocabulary table
+        sname, tname = table_spec.split(":", 1)
+        if sname.startswith("vocab") or "vocab" in sname.lower():
+            # Full copy of vocabulary table
+            query = f"/entity/{_quote_table_spec(table_spec)}"
+            rows = copy_with_path(query, table_spec)
+            rows_by_table[table_spec] = rows
+            tables_copied.add(table_spec)
+            logger.info(f"Copied {rows} rows for vocabulary table {table_spec}")
+
+    return rows_by_table
+
+
+def _copy_subset_table_data(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    reachable_rids: set[str],
+    page_size: int,
+    report: "CloneReport",
+    truncate_oversized: bool = False,
+) -> tuple[int, int, list[str], list[TruncatedValue]]:
+    """Copy only rows with RIDs in reachable_rids from source to destination.
+
+    Similar to _copy_table_data_with_retry but filters to only reachable RIDs.
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        sname: Schema name.
+        tname: Table name.
+        reachable_rids: Set of RIDs to copy.
+        page_size: Number of rows to fetch per request.
+        report: Clone report for recording issues.
+        truncate_oversized: Whether to truncate oversized values.
+
+    Returns:
+        Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
+    """
+    tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+    table_key = f"{sname}:{tname}"
+
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+
+    rows_copied = 0
+    rows_skipped = 0
+    skipped_rids: list[str] = []
+    truncated_values: list[TruncatedValue] = []
+
+    if not reachable_rids:
+        return 0, 0, [], []
+
+    # Convert RIDs to sorted list for pagination
+    rid_list = sorted(reachable_rids)
+
+    # Process in batches
+    for i in range(0, len(rid_list), page_size):
+        batch_rids = rid_list[i:i + page_size]
+
+        # Build query with RID filter
+        rid_filter = ",".join(urlquote(rid) for rid in batch_rids)
+        try:
+            page = src_catalog.get(f"/entity/{tname_uri}/RID=any({rid_filter})").json()
+        except Exception as e:
+            logger.warning(f"Failed to fetch batch from {table_key}: {e}")
+            rows_skipped += len(batch_rids)
+            skipped_rids.extend(batch_rids)
+            continue
+
+        if not page:
+            continue
+
+        # Optionally truncate oversized values
+        rows_to_insert = []
+        for row in page:
+            if truncate_oversized:
+                modified_row = row.copy()
+                for col, value in row.items():
+                    if isinstance(value, str):
+                        value_bytes = len(value.encode('utf-8'))
+                        if value_bytes > MAX_INDEX_VALUE_BYTES:
+                            max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                            truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                            while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                                max_chars -= 100
+                                truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                            modified_row[col] = truncated
+                            truncated_values.append(TruncatedValue(
+                                table=table_key,
+                                rid=str(row.get('RID', 'unknown')),
+                                column=col,
+                                original_bytes=value_bytes,
+                                truncated_bytes=len(truncated.encode('utf-8')),
+                            ))
+                rows_to_insert.append(modified_row)
+            else:
+                rows_to_insert.append(row)
+
+        # Insert into destination
+        try:
+            dst_catalog.post(f"/entity/{tname_uri}", json=rows_to_insert)
+            rows_copied += len(rows_to_insert)
+        except Exception as e:
+            error_str = str(e)
+            if "index row size" in error_str.lower() or "btree" in error_str.lower():
+                # Row-by-row fallback for index size errors
+                for row in rows_to_insert:
+                    try:
+                        dst_catalog.post(f"/entity/{tname_uri}", json=[row])
+                        rows_copied += 1
+                    except Exception:
+                        rows_skipped += 1
+                        skipped_rids.append(str(row.get('RID', 'unknown')))
+            else:
+                logger.warning(f"Failed to insert into {table_key}: {e}")
+                rows_skipped += len(rows_to_insert)
+                skipped_rids.extend(str(row.get('RID', 'unknown')) for row in rows_to_insert)
+
+    return rows_copied, rows_skipped, skipped_rids, truncated_values
+
+
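Both new copy helpers repeat the same byte-oriented truncation rule: the limit is measured in UTF-8 bytes rather than characters, so the loop backs off in 100-character steps whenever the cut still overshoots on multi-byte text. The rule in isolation, as a standalone sketch of the inline logic above (extracting it into a shared helper would remove the duplication between `_copy_data_via_export_paths` and `_copy_subset_table_data`):

    MAX_INDEX_VALUE_BYTES = 2600
    TRUNCATE_SUFFIX = "...[TRUNCATED]"

    def truncate_for_index(value: str) -> str:
        """Trim value so its UTF-8 encoding stays under the btree index limit."""
        if len(value.encode("utf-8")) <= MAX_INDEX_VALUE_BYTES:
            return value
        max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode("utf-8"))
        truncated = value[:max_chars] + TRUNCATE_SUFFIX
        while len(truncated.encode("utf-8")) > MAX_INDEX_VALUE_BYTES:
            max_chars -= 100  # multi-byte characters can overshoot the byte budget
            truncated = value[:max_chars] + TRUNCATE_SUFFIX
        return truncated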
 def clone_catalog(
     source_hostname: str,
     source_catalog_id: str,
@@ -1072,7 +2055,7 @@ def clone_catalog(
     clone_timestamp = datetime.now(timezone.utc).isoformat()
 
     # Perform the three-stage clone
-    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values = _clone_three_stage(
+    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values = _clone_three_stage(
         src_catalog=src_catalog,
         dst_catalog=dst_catalog,
         copy_data=not schema_only,
@@ -1136,8 +2119,13 @@ def clone_catalog(
         asset_mode=asset_mode.value,
         exclude_schemas=exclude_schemas or [],
         exclude_objects=exclude_objects or [],
+        add_ml_schema=add_ml_schema,
+        copy_annotations=copy_annotations,
+        copy_policy=copy_policy,
+        reinitialize_dataset_versions=reinitialize_dataset_versions,
         rows_copied=total_rows_copied,
         rows_skipped=rows_skipped,
+        skipped_rids=skipped_rids,
         truncated_count=len(truncated_values),
         orphan_rows_removed=orphan_rows_removed,
         orphan_rows_nullified=orphan_rows_nullified,
@@ -1186,10 +2174,10 @@ def _clone_three_stage(
     prune_hidden_fkeys: bool,
     truncate_oversized: bool,
     report: CloneReport,
-) -> tuple[int, int, int, int, list[TruncatedValue]]:
+) -> tuple[int, int, int, int, list[str], list[TruncatedValue]]:
     """Perform three-stage catalog cloning.
 
-    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values)
+    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values)
     """
     src_model = src_catalog.getCatalogModel()
 
@@ -1282,7 +2270,7 @@ def _clone_three_stage(
             report.add_issue(CloneIssue(
                 severity=CloneIssueSeverity.WARNING,
                 category=CloneIssueCategory.FK_PRUNED,
-                message=f"FK pruned due to hidden reference data",
+                message="FK pruned due to hidden reference data",
                 table=f"{sname}:{tname}",
                 details=f"FK {fk_name} references columns with 'select': null",
                 action="Source catalog may have incoherent policies",
@@ -1328,6 +2316,7 @@ def _clone_three_stage(
     # Stage 2: Copy data
     total_rows = 0
     total_rows_skipped = 0
+    all_skipped_rids: list[str] = []
     all_truncated_values: list[TruncatedValue] = []
     deferred_indexes: dict[str, list[dict]] = {}  # Track indexes dropped for later rebuild
 
@@ -1343,7 +2332,7 @@ def _clone_three_stage(
             logger.debug(f"Copying data for {table_key}")
 
             # Use the new copy function with index error handling
-            table_rows, rows_skipped, truncated = _copy_table_data_with_retry(
+            table_rows, rows_skipped, skipped_rids, truncated = _copy_table_data_with_retry(
                 src_catalog=src_catalog,
                 dst_catalog=dst_catalog,
                 sname=sname,
@@ -1355,6 +2344,7 @@ def _clone_three_stage(
             )
 
             total_rows_skipped += rows_skipped
+            all_skipped_rids.extend(skipped_rids)
             all_truncated_values.extend(truncated)
 
             if table_rows < 0:
@@ -1581,7 +2571,7 @@ def _clone_three_stage(
     if copy_annotations or copy_policy:
         _copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)
 
-    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_truncated_values
+    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_skipped_rids, all_truncated_values
 
 
 def _identify_orphan_values(
@@ -1684,7 +2674,7 @@ def _delete_orphan_rows(
             report.add_issue(CloneIssue(
                 severity=CloneIssueSeverity.WARNING,
                 category=CloneIssueCategory.ORPHAN_ROWS,
-                message=f"Orphan rows deleted",
+                message="Orphan rows deleted",
                 table=table_key,
                 details=f"Missing references to: {ref_key} ({len(orphan_values)})",
                 action="Source catalog may have incoherent row-level policies",
@@ -1892,10 +2882,27 @@ def _post_clone_operations(
 
     if add_ml_schema:
         try:
-            from deriva_ml.schema import create_ml_schema
             catalog = server.connect_ermrest(result.catalog_id)
             create_ml_schema(catalog)
             result.ml_schema_added = True
+
+            # Apply catalog annotations (chaise-config, navbar, etc.)
+            # Import DerivaML locally to avoid circular import (deriva_ml.__init__ imports from clone.py)
+            try:
+                from deriva_ml import DerivaML
+                ml = DerivaML(result.hostname, result.catalog_id, check_auth=False)
+                ml.apply_catalog_annotations()
+                logger.info("Applied catalog annotations (chaise-config, navbar)")
+            except Exception as e:
+                logger.warning(f"Failed to apply catalog annotations: {e}")
+                if result.report:
+                    result.report.add_issue(CloneIssue(
+                        severity=CloneIssueSeverity.WARNING,
+                        category=CloneIssueCategory.SCHEMA_ISSUE,
+                        message="Failed to apply catalog annotations",
+                        details=str(e),
+                        action="Manually call apply_catalog_annotations() after clone",
+                    ))
         except Exception as e:
             logger.warning(f"Failed to add ML schema: {e}")
             if result.report:
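If the annotation step fails, the recorded action suggests manual recovery after the clone finishes; per the code above, that amounts to:

    from deriva_ml import DerivaML

    ml = DerivaML(result.hostname, result.catalog_id, check_auth=False)
    ml.apply_catalog_annotations()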
@@ -1945,3 +2952,549 @@ def _reinitialize_dataset_versions(
         logger.warning(f"Failed to reinitialize dataset versions: {e}")
 
     return result
+
+
+# =============================================================================
+# Clone Subset Catalog
+# =============================================================================
+
+
2962
+ def clone_subset_catalog(
2963
+ source_hostname: str,
2964
+ source_catalog_id: str,
2965
+ root_rid: str,
2966
+ include_tables: list[str] | None = None,
2967
+ exclude_objects: list[str] | None = None,
2968
+ exclude_schemas: list[str] | None = None,
2969
+ include_associations: bool = True,
2970
+ include_vocabularies: bool = True,
2971
+ use_export_annotation: bool = False,
2972
+ dest_hostname: str | None = None,
2973
+ alias: str | None = None,
2974
+ add_ml_schema: bool = False,
2975
+ asset_mode: AssetCopyMode = AssetCopyMode.REFERENCES,
2976
+ copy_annotations: bool = True,
2977
+ copy_policy: bool = True,
2978
+ source_credential: dict | None = None,
2979
+ dest_credential: dict | None = None,
2980
+ orphan_strategy: OrphanStrategy = OrphanStrategy.FAIL,
2981
+ prune_hidden_fkeys: bool = False,
2982
+ truncate_oversized: bool = False,
2983
+ reinitialize_dataset_versions: bool = True,
2984
+ ) -> CloneCatalogResult:
2985
+     """Clone a subset of a catalog containing only data reachable from a root RID.
+
+     Tables to include can be determined either by FK graph traversal or from an
+     export annotation. When use_export_annotation=True, the tables and paths
+     defined in the root table's export annotation (tag:isrd.isi.edu,2019:export)
+     are used, which matches the behavior of the BDBag export button.
+
+     Args:
+         source_hostname: Hostname of the source catalog server.
+         source_catalog_id: ID of the catalog to clone from.
+         root_rid: The starting RID from which to trace reachability.
+         include_tables: Optional list of additional table names ("schema:table"
+             format) to use as starting points for table discovery. If None,
+             discovery starts only from the root RID's table.
+         exclude_objects: List of tables ("schema:table" format) to exclude from
+             cloning. Paths through these tables are not followed.
+         exclude_schemas: List of schema names to exclude entirely from cloning.
+         include_associations: If True, auto-include association tables needed
+             for FK integrity between discovered tables.
+         include_vocabularies: If True, auto-include vocabulary tables referenced
+             by discovered tables.
+         use_export_annotation: If True, use the export annotation on the root
+             table to determine which tables and paths to clone. This matches the
+             behavior of the BDBag export button. If False (default), discover
+             tables via FK graph traversal.
+         dest_hostname: Destination hostname. If None, uses source_hostname.
+         alias: Optional alias for the new catalog.
+         add_ml_schema: If True, add the DerivaML schema to the clone.
+         asset_mode: How to handle assets (NONE, REFERENCES, or FULL).
+         copy_annotations: If True, copy annotations to the clone.
+         copy_policy: If True, copy ACLs/ACL bindings to the clone.
+         source_credential: Credentials for the source catalog.
+         dest_credential: Credentials for the destination catalog.
+         orphan_strategy: How to handle orphan rows (FAIL, DELETE, or NULLIFY).
+         prune_hidden_fkeys: If True, prune FKs with hidden reference data.
+         truncate_oversized: If True, truncate values that exceed index size limits.
+         reinitialize_dataset_versions: If True, reinitialize dataset versions.
+
+     Returns:
+         CloneCatalogResult with details of the cloned catalog.
+
+     Raises:
+         ValueError: If root_rid is not found in any table.
+         ValueError: If include_tables contains invalid table specifications.
+         ValueError: If use_export_annotation=True but no export annotation is found.
+
+     Example:
+         >>> # Clone using export annotation (matches BDBag export)
+         >>> result = clone_subset_catalog(
+         ...     source_hostname="www.facebase.org",
+         ...     source_catalog_id="1",
+         ...     root_rid="3-HXMC",
+         ...     use_export_annotation=True,
+         ...     alias="my-project-clone",
+         ... )
+
+         >>> # Clone all tables reachable from a dataset, excluding Execution table
+         >>> result = clone_subset_catalog(
+         ...     source_hostname="www.example.org",
+         ...     source_catalog_id="1",
+         ...     root_rid="ABC123",
+         ...     exclude_objects=["deriva-ml:Execution"],
+         ...     alias="my-subset",
+         ... )
+         >>> print(f"Created catalog {result.catalog_id}")
+
+         >>> # Clone with additional starting tables
+         >>> result = clone_subset_catalog(
+         ...     source_hostname="www.example.org",
+         ...     source_catalog_id="1",
+         ...     root_rid="ABC123",
+         ...     include_tables=["demo:Configuration"],  # Also discover from here
+         ...     exclude_schemas=["audit"],
+         ... )
+     """
+     include_tables = include_tables or []
+     exclude_objects = exclude_objects or []
+     exclude_schemas_set = set(exclude_schemas) if exclude_schemas else set()
+
+     # Validate table format for include_tables
+     for table_spec in include_tables:
+         if ":" not in table_spec:
+             raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+
+     # Parse exclude_objects into a set of (schema, table) tuples
+     excluded_tables: set[tuple[str, str]] = set()
+     for table_spec in exclude_objects:
+         if ":" not in table_spec:
+             raise ValueError(f"exclude_objects entries must be 'schema:table', got: {table_spec}")
+         schema, table = table_spec.split(":", 1)
+         excluded_tables.add((schema, table))
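+     # e.g. "deriva-ml:Execution" -> ("deriva-ml", "Execution"); exclusion
+     # matching below is exact on these (schema, table) pairs.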
+
+     dest_hostname = dest_hostname or source_hostname
+
+     # Get credentials
+     src_cred = source_credential or get_credential(source_hostname)
+     dst_cred = dest_credential or get_credential(dest_hostname)
+
+     # Connect to source catalog
+     src_server = DerivaServer("https", source_hostname, credentials=src_cred)
+     src_catalog = src_server.connect_ermrest(source_catalog_id)
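+     # Fetch the full ERMrest model (schemas, tables, keys, FKs); it drives the
+     # table discovery and schema reconstruction below.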
+     src_model = src_catalog.getCatalogModel()
+
+     logger.info(f"Connected to source catalog {source_hostname}/{source_catalog_id}")
+
+     # First, find the table containing the root RID
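+     # A RID is unique across the catalog but does not identify its table, so
+     # probe each candidate table with GET /entity/<schema:table>/RID=<rid>.
+     # Tables the caller cannot read raise errors and are simply skipped.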
+     root_table_key = None
+     for sname, schema in src_model.schemas.items():
+         if sname in {"public", "_acl_admin", "WWW"} or sname in exclude_schemas_set:
+             continue
+         for tname, table in schema.tables.items():
+             if (sname, tname) in excluded_tables:
+                 continue
+             if table.kind != 'table' or 'RID' not in table.column_definitions.elements:
+                 continue
+             try:
+                 table_spec = f"{sname}:{tname}"
+                 uri = f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(root_rid)}"
+                 result = src_catalog.get(uri).json()
+                 if result:
+                     root_table_key = table_spec
+                     break
+             except Exception:
+                 continue
+         if root_table_key:
+             break
+
+     if root_table_key is None:
+         raise ValueError(f"Root RID {root_rid} not found in any accessible table")
+
+     logger.info(f"Root RID {root_rid} found in table {root_table_key}")
+
+     # Get the root table object for export annotation parsing
+     root_schema, root_tname = root_table_key.split(":", 1)
+     root_table_obj = src_model.schemas[root_schema].tables[root_tname]
+
+     # Track paths for efficient RID computation (when using export annotation)
+     export_paths: list[list[str]] = []
+
+     if use_export_annotation:
+         # Use the export annotation to determine tables
+         logger.info("Using export annotation to determine tables...")
+         discovered_tables, export_paths = _parse_export_annotation_tables(
+             root_table_obj, []
+         )
+
+         # The root table itself is always present, so <= 1 means no usable paths
+         if not discovered_tables or len(discovered_tables) <= 1:
+             raise ValueError(
+                 f"No export annotation found on table {root_table_key} or annotation "
+                 f"contains no paths. Set use_export_annotation=False to use FK graph traversal."
+             )
+
+         logger.info(f"Export annotation defines {len(discovered_tables)} tables and {len(export_paths)} paths")
+
+         # Add any explicitly included tables
+         for table_spec in include_tables:
+             if table_spec not in discovered_tables:
+                 discovered_tables.append(table_spec)
+
+         # Filter out excluded tables
+         discovered_tables = [
+             t for t in discovered_tables
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+     else:
+         # Build starting tables: root table + any explicitly included tables
+         start_tables = [root_table_key]
+         for table_spec in include_tables:
+             if table_spec not in start_tables:
+                 start_tables.append(table_spec)
+
+         # Discover all reachable tables from the starting points via FK traversal
+         logger.info(f"Discovering tables reachable from {start_tables}...")
+         discovered_tables = _discover_reachable_tables(
+             model=src_model,
+             start_tables=start_tables,
+             exclude_tables=excluded_tables,
+             exclude_schemas=exclude_schemas_set,
+         )
+
+         logger.info(f"Discovered {len(discovered_tables)} connected tables")
+
+     # Expand with associations and vocabularies
+     all_tables = list(discovered_tables)
+     associations_added: list[str] = []
+     vocabularies_added: list[str] = []
+
+     if include_associations:
+         all_tables, associations_added = _expand_tables_with_associations(src_model, all_tables)
+         # Filter out excluded tables from associations
+         associations_added = [
+             t for t in associations_added
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         all_tables = [
+             t for t in all_tables
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         if associations_added:
+             logger.info(f"Auto-added association tables: {associations_added}")
+
+     if include_vocabularies:
+         all_tables, vocabularies_added = _expand_tables_with_vocabularies(src_model, all_tables)
+         # Filter out excluded tables from vocabularies
+         vocabularies_added = [
+             t for t in vocabularies_added
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         all_tables = [
+             t for t in all_tables
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         if vocabularies_added:
+             logger.info(f"Auto-added vocabulary tables: {vocabularies_added}")
+
+     logger.info(f"Will clone {len(all_tables)} tables: {all_tables}")
+
+     # Compute reachable RIDs
+     logger.info(f"Computing reachable rows from root RID {root_rid}...")
+
+     if use_export_annotation and export_paths:
+         # Use the predefined paths from the export annotation (more efficient);
+         # also pass the model to enable FK reference expansion
+         reachable_rids = _compute_reachable_rids_from_paths(
+             src_catalog, root_rid, root_table_key, export_paths, all_tables, src_model
+         )
+     else:
+         # Use FK graph traversal
+         reachable_rids = _compute_reachable_rids(src_catalog, root_rid, all_tables, src_model)
+
+     total_rows = sum(len(rids) for rids in reachable_rids.values())
+     logger.info(f"Found {total_rows} reachable rows across {len(all_tables)} tables")
+
+     for table_spec, rids in reachable_rids.items():
+         if rids:
+             logger.debug(f"  {table_spec}: {len(rids)} rows")
+
+     # Create report
+     report = CloneReport()
+
+     # Parse tables into a set for quick lookup
+     included_tables: set[tuple[str, str]] = set()
+     for table_spec in all_tables:
+         schema, table = table_spec.split(":", 1)
+         included_tables.add((schema, table))
+
+     # Create destination catalog
+     dst_server = DerivaServer("https", dest_hostname, credentials=dst_cred)
+     dst_catalog = dst_server.create_ermrest_catalog()
+     dst_catalog_id = dst_catalog.catalog_id
+
+     logger.info(f"Created destination catalog {dest_hostname}/{dst_catalog_id}")
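+     # Everything below runs inside try/except so that a failed clone deletes
+     # the half-built destination catalog instead of leaking it (see the final
+     # except block).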
+
+     try:
+         # Build model content for the included tables only
+         new_model = []
+         fkeys_deferred = []
+         clone_states = {}
+
+         def prune_parts(d, *extra_victims):
+             victims = set(extra_victims)
+             if not copy_annotations:
+                 victims |= {'annotations'}
+             if not copy_policy:
+                 victims |= {'acls', 'acl_bindings'}
+             for k in victims:
+                 d.pop(k, None)
+             return d
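+         # e.g. prune_parts(table.prejson(), 'foreign_keys') drops the FK
+         # definitions and, per the copy_annotations/copy_policy flags, the
+         # 'annotations', 'acls', and 'acl_bindings' sections in place.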
+
+         # Collect schemas that have included tables
+         included_schemas: set[str] = {schema for schema, _ in included_tables}
+
+         for sname in included_schemas:
+             if sname not in src_model.schemas:
+                 continue
+
+             schema = src_model.schemas[sname]
+             schema_def = prune_parts(schema.prejson(), 'tables')
+             new_model.append(schema_def)
+
+             for tname, table in schema.tables.items():
+                 if (sname, tname) not in included_tables:
+                     continue
+
+                 if table.kind != 'table':
+                     continue
+
+                 if 'RID' not in table.column_definitions.elements:
+                     logger.warning(f"Table {sname}.{tname} lacks system columns, skipping")
+                     report.tables_skipped.append(f"{sname}:{tname}")
+                     continue
+
+                 # Create the table definition without FKs
+                 table_def = prune_parts(table.prejson(), 'foreign_keys')
+                 table_def['column_definitions'] = [
+                     prune_parts(c) for c in table_def['column_definitions']
+                 ]
+                 table_def['keys'] = [prune_parts(k) for k in table_def.get('keys', [])]
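+                 # Clone-state annotation: 1 = table created, data still pending;
+                 # flipped to 2 below once the table's rows have been copied.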
+                 table_def.setdefault('annotations', {})[_clone_state_url] = 1
+
+                 new_model.append(table_def)
+                 clone_states[(sname, tname)] = 1
+
+                 # Collect FKs (only those between included tables)
+                 for fkdef in table.prejson().get('foreign_keys', []):
+                     include_fk = True
+                     for ref_col in fkdef.get('referenced_columns', []):
+                         ref_schema = ref_col.get('schema_name')
+                         ref_table = ref_col.get('table_name')
+                         if (ref_schema, ref_table) not in included_tables:
+                             include_fk = False
+                             break
+
+                     if include_fk:
+                         fkeys_deferred.append((sname, tname, prune_parts(fkdef.copy())))
+
+         # Stage 1: Create schema without FKs
+         logger.info("Stage 1: Creating schema without foreign keys...")
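+         # Posting the combined document creates every schema and (FK-less)
+         # table in a single ERMrest request.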
+         if new_model:
+             dst_catalog.post("/schema", json=new_model)
+
+         # Stage 2: Copy filtered data
+         logger.info("Stage 2: Copying filtered data...")
+         total_rows_copied = 0
+         total_rows_skipped = 0
+         all_skipped_rids: list[str] = []
+         all_truncated_values: list[TruncatedValue] = []
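+         # Copy rows in fixed-size pages to bound individual request sizes.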
+         page_size = 1000
+
+         if use_export_annotation and export_paths:
+             # Use path-based copying to respect row-level security
+             logger.info("Using path-based copying (respects row-level ACLs)...")
+             rows_by_table = _copy_data_via_export_paths(
+                 src_catalog=src_catalog,
+                 dst_catalog=dst_catalog,
+                 root_table=root_table_key,
+                 root_rid=root_rid,
+                 export_paths=export_paths,
+                 all_tables=all_tables,
+                 report=report,
+                 truncate_oversized=truncate_oversized,
+                 page_size=page_size,
+             )
+             for table_key, rows in rows_by_table.items():
+                 report.tables_restored[table_key] = rows
+                 total_rows_copied += rows
+
+             # Mark all tables complete
+             for (sname, tname), state in clone_states.items():
+                 if state == 1:
+                     try:
+                         dst_catalog.put(
+                             f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                             json=2
+                         )
+                     except Exception:
+                         pass
+         else:
+             # Use RID-based copying (original approach)
+             for (sname, tname), state in clone_states.items():
+                 if state != 1:
+                     continue
+
+                 table_key = f"{sname}:{tname}"
+                 table_reachable = reachable_rids.get(table_key, set())
+
+                 if not table_reachable:
+                     logger.debug(f"No reachable rows for {table_key}")
+                     report.tables_restored[table_key] = 0
+                     continue
+
+                 logger.debug(f"Copying {len(table_reachable)} rows for {table_key}")
+
+                 rows_copied, rows_skipped, skipped, truncated = _copy_subset_table_data(
+                     src_catalog=src_catalog,
+                     dst_catalog=dst_catalog,
+                     sname=sname,
+                     tname=tname,
+                     reachable_rids=table_reachable,
+                     page_size=page_size,
+                     report=report,
+                     truncate_oversized=truncate_oversized,
+                 )
+
+                 total_rows_copied += rows_copied
+                 total_rows_skipped += rows_skipped
+                 all_skipped_rids.extend(skipped)
+                 all_truncated_values.extend(truncated)
+
+                 report.tables_restored[table_key] = rows_copied
+
+                 # Mark complete
+                 try:
+                     dst_catalog.put(
+                         f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                         json=2
+                     )
+                 except Exception:
+                     pass
+
+         logger.info(f"Copied {total_rows_copied} rows, skipped {total_rows_skipped}")
+
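+         # FKs were deliberately omitted in Stage 1 so rows could be inserted in
+         # any order; applying them now also validates the referential integrity
+         # of the copied subset.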
+         # Stage 3: Apply FKs
+         logger.info(f"Stage 3: Applying {len(fkeys_deferred)} foreign keys...")
+         fkeys_applied = 0
+         fkeys_failed = 0
+
+         for sname, tname, fkdef in fkeys_deferred:
+             fk_name = fkdef.get('names', [[sname, 'unknown']])[0]
+             try:
+                 dst_catalog.post(
+                     f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/foreignkey",
+                     json=fkdef
+                 )
+                 fkeys_applied += 1
+                 report.fkeys_applied += 1
+             except Exception as e:
+                 error_str = str(e)
+                 if "violates foreign key constraint" in error_str and orphan_strategy == OrphanStrategy.FAIL:
+                     report.add_issue(CloneIssue(
+                         severity=CloneIssueSeverity.ERROR,
+                         category=CloneIssueCategory.FK_VIOLATION,
+                         message="FK constraint violation",
+                         table=f"{sname}:{tname}",
+                         details=f"FK {fk_name}: {error_str[:200]}",
+                         action="Some reachable rows may have dangling references",
+                     ))
+                 else:
+                     logger.warning(f"Failed to apply FK {fk_name}: {e}")
+                 fkeys_failed += 1
+                 report.fkeys_failed += 1
+
+         logger.info(f"Applied {fkeys_applied} FKs, failed {fkeys_failed}")
+
+         # Build result
+         result = CloneCatalogResult(
+             catalog_id=dst_catalog_id,
+             hostname=dest_hostname,
+             schema_only=False,
+             asset_mode=asset_mode,
+             source_hostname=source_hostname,
+             source_catalog_id=source_catalog_id,
+             source_snapshot=None,
+             alias=alias,
+             orphan_rows_removed=0,
+             orphan_rows_nullified=0,
+             fkeys_pruned=0,
+             rows_skipped=total_rows_skipped,
+             truncated_values=all_truncated_values,
+             report=report,
+         )
+
+         # Post-clone operations
+         if alias:
+             try:
+                 dst_server.create_ermrest_alias(id=alias, alias_target=str(dst_catalog_id))
+                 result.alias = alias
+             except Exception as e:
+                 logger.warning(f"Failed to create alias '{alias}': {e}")
+
+         if add_ml_schema:
+             try:
+                 create_ml_schema(dst_catalog)
+                 result.ml_schema_added = True
+
+                 # Apply catalog annotations (chaise-config, navbar, etc.)
+                 # Import DerivaML locally to avoid a circular import
+                 # (deriva_ml.__init__ imports from clone.py)
+                 try:
+                     from deriva_ml import DerivaML
+                     ml = DerivaML(dest_hostname, str(dst_catalog_id), check_auth=False)
+                     ml.apply_catalog_annotations()
+                     logger.info("Applied catalog annotations (chaise-config, navbar)")
+                 except Exception as e:
+                     logger.warning(f"Failed to apply catalog annotations: {e}")
+             except Exception as e:
+                 logger.warning(f"Failed to add ML schema: {e}")
+
+         if reinitialize_dataset_versions and "deriva-ml" in src_model.schemas:
+             result = _reinitialize_dataset_versions(result, dst_cred)
+
+         # Set defaultTable to the root table for partial clones.
+         # This ensures the Chaise UI has a valid landing page.
+         try:
+             chaise_config_url = "tag:isrd.isi.edu,2019:chaise-config"
+             dst_model = dst_catalog.getCatalogModel()
+             dst_model.annotations[chaise_config_url] = dst_model.annotations.get(chaise_config_url, {})
+             # Chaise expects defaultTable as an object with schema and table keys
+             root_schema, root_tname = root_table_key.split(":", 1)
+             dst_model.annotations[chaise_config_url]["defaultTable"] = {
+                 "schema": root_schema,
+                 "table": root_tname,
+             }
+             dst_model.apply()
+             logger.info(f"Set defaultTable to {root_table_key}")
+         except Exception as e:
+             logger.warning(f"Failed to set defaultTable annotation: {e}")
+
+         logger.info(
+             f"Subset clone complete: {dest_hostname}/{dst_catalog_id} "
+             f"({total_rows_copied} rows in {len(clone_states)} tables)"
+         )
+
+         return result
+
+     except Exception as e:
+         # Clean up on failure
+         logger.error(f"Clone failed: {e}")
+         try:
+             dst_server.delete_ermrest_catalog(dst_catalog_id)
+             logger.info(f"Cleaned up failed catalog {dst_catalog_id}")
+         except Exception:
+             pass
+         raise