deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. deriva_ml/__init__.py +2 -2
  2. deriva_ml/asset/asset.py +0 -4
  3. deriva_ml/catalog/__init__.py +6 -0
  4. deriva_ml/catalog/clone.py +1591 -38
  5. deriva_ml/catalog/localize.py +66 -29
  6. deriva_ml/core/base.py +12 -9
  7. deriva_ml/core/definitions.py +13 -12
  8. deriva_ml/core/ermrest.py +11 -12
  9. deriva_ml/core/mixins/annotation.py +2 -2
  10. deriva_ml/core/mixins/asset.py +3 -3
  11. deriva_ml/core/mixins/dataset.py +3 -3
  12. deriva_ml/core/mixins/execution.py +1 -0
  13. deriva_ml/core/mixins/feature.py +2 -2
  14. deriva_ml/core/mixins/file.py +2 -2
  15. deriva_ml/core/mixins/path_builder.py +2 -2
  16. deriva_ml/core/mixins/rid_resolution.py +2 -2
  17. deriva_ml/core/mixins/vocabulary.py +2 -2
  18. deriva_ml/core/mixins/workflow.py +3 -3
  19. deriva_ml/dataset/catalog_graph.py +3 -4
  20. deriva_ml/dataset/dataset.py +5 -3
  21. deriva_ml/dataset/dataset_bag.py +0 -2
  22. deriva_ml/dataset/upload.py +2 -2
  23. deriva_ml/demo_catalog.py +0 -1
  24. deriva_ml/execution/__init__.py +8 -8
  25. deriva_ml/execution/base_config.py +2 -2
  26. deriva_ml/execution/execution.py +5 -3
  27. deriva_ml/execution/execution_record.py +0 -1
  28. deriva_ml/execution/model_protocol.py +1 -1
  29. deriva_ml/execution/multirun_config.py +0 -1
  30. deriva_ml/execution/runner.py +3 -3
  31. deriva_ml/experiment/experiment.py +3 -3
  32. deriva_ml/feature.py +2 -2
  33. deriva_ml/interfaces.py +2 -2
  34. deriva_ml/model/__init__.py +45 -24
  35. deriva_ml/model/annotations.py +0 -1
  36. deriva_ml/model/catalog.py +3 -2
  37. deriva_ml/model/data_loader.py +330 -0
  38. deriva_ml/model/data_sources.py +439 -0
  39. deriva_ml/model/database.py +216 -32
  40. deriva_ml/model/fk_orderer.py +379 -0
  41. deriva_ml/model/handles.py +1 -1
  42. deriva_ml/model/schema_builder.py +816 -0
  43. deriva_ml/run_model.py +3 -3
  44. deriva_ml/schema/annotations.py +2 -1
  45. deriva_ml/schema/create_schema.py +1 -1
  46. deriva_ml/schema/validation.py +1 -1
  47. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
  48. deriva_ml-1.17.16.dist-info/RECORD +81 -0
  49. deriva_ml-1.17.14.dist-info/RECORD +0 -77
  50. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
  51. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
  52. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
  53. {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0
@@ -20,7 +20,7 @@ from __future__ import annotations
 
 import json
 import logging
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime, timezone
 from enum import Enum
 from typing import Any
@@ -29,6 +29,9 @@ from urllib.parse import quote as urlquote
 from deriva.core import DerivaServer, ErmrestCatalog, get_credential
 from deriva.core.hatrac_store import HatracStore
 
+from deriva_ml.model.catalog import VOCAB_COLUMNS
+from deriva_ml.schema import create_ml_schema
+
 logger = logging.getLogger("deriva_ml")
 
 
@@ -87,9 +90,10 @@ class CloneIssue:
     details: str | None = None
     action: str | None = None
     row_count: int = 0
+    skipped_rids: list[str] | None = None  # RIDs of rows that were skipped
 
     def to_dict(self) -> dict[str, Any]:
-        return {
+        result = {
             "severity": self.severity.value,
             "category": self.category.value,
             "message": self.message,
@@ -98,6 +102,9 @@ class CloneIssue:
             "action": self.action,
             "row_count": self.row_count,
         }
+        if self.skipped_rids:
+            result["skipped_rids"] = self.skipped_rids
+        return result
 
     def __str__(self) -> str:
         parts = [f"[{self.severity.value.upper()}]"]
@@ -106,7 +113,32 @@ class CloneIssue:
         parts.append(self.message)
         if self.row_count > 0:
             parts.append(f"({self.row_count} rows)")
-        return " ".join(parts)
+        result = " ".join(parts)
+        if self.skipped_rids:
+            # For small numbers, list the RIDs; for large numbers, just show count
+            if len(self.skipped_rids) <= 5:
+                result += f"\n Skipped RIDs: {', '.join(self.skipped_rids)}"
+            else:
+                result += f"\n Skipped RIDs: {len(self.skipped_rids)} rows (see JSON for full list)"
+        return result
+
+
+@dataclass
+class CloneReportSummary:
+    """Summary statistics for a clone operation."""
+
+    total_issues: int
+    errors: int
+    warnings: int
+    tables_restored: int
+    tables_failed: int
+    tables_skipped: int
+    total_rows_restored: int
+    orphan_rows_removed: int
+    orphan_rows_nullified: int
+    fkeys_applied: int
+    fkeys_failed: int
+    fkeys_pruned: int
 
 
 @dataclass
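To illustrate the new `__str__` behavior, a hypothetical issue (table and RID values are made up, and the exact header text depends on fields not shown in this hunk):

    issue = CloneIssue(
        severity=CloneIssueSeverity.WARNING,
        category=CloneIssueCategory.ORPHAN_ROWS,
        message="Orphan rows deleted",
        table="demo:Subject",  # hypothetical
        row_count=3,
        skipped_rids=["1-A1", "1-A2", "1-A3"],  # hypothetical RIDs
    )
    print(issue)
    # Prints something like:
    # [WARNING] Orphan rows deleted (3 rows)
    #  Skipped RIDs: 1-A1, 1-A2, 1-A3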
@@ -134,27 +166,32 @@ class CloneReport:
     def add_issue(self, issue: CloneIssue) -> None:
         self.issues.append(issue)
 
+    @property
+    def summary(self) -> CloneReportSummary:
+        """Return summary statistics as a dataclass."""
+        return CloneReportSummary(
+            total_issues=len(self.issues),
+            errors=len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
+            warnings=len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
+            tables_restored=len(self.tables_restored),
+            tables_failed=len(self.tables_failed),
+            tables_skipped=len(self.tables_skipped),
+            total_rows_restored=sum(self.tables_restored.values()),
+            orphan_rows_removed=sum(
+                d.get("rows_removed", 0) for d in self.orphan_details.values()
+            ),
+            orphan_rows_nullified=sum(
+                d.get("rows_nullified", 0) for d in self.orphan_details.values()
+            ),
+            fkeys_applied=self.fkeys_applied,
+            fkeys_failed=self.fkeys_failed,
+            fkeys_pruned=self.fkeys_pruned,
+        )
+
     def to_dict(self) -> dict[str, Any]:
         """Return the report as a JSON-serializable dictionary."""
         return {
-            "summary": {
-                "total_issues": len(self.issues),
-                "errors": len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
-                "warnings": len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
-                "tables_restored": len(self.tables_restored),
-                "tables_failed": len(self.tables_failed),
-                "tables_skipped": len(self.tables_skipped),
-                "total_rows_restored": sum(self.tables_restored.values()),
-                "orphan_rows_removed": sum(
-                    d.get("rows_removed", 0) for d in self.orphan_details.values()
-                ),
-                "orphan_rows_nullified": sum(
-                    d.get("rows_nullified", 0) for d in self.orphan_details.values()
-                ),
-                "fkeys_applied": self.fkeys_applied,
-                "fkeys_failed": self.fkeys_failed,
-                "fkeys_pruned": self.fkeys_pruned,
-            },
+            "summary": asdict(self.summary),
             "issues": [i.to_dict() for i in self.issues],
             "tables_restored": self.tables_restored,
             "tables_failed": self.tables_failed,
@@ -332,6 +369,7 @@ class CloneDetails:
     source_catalog_id: str
     source_snapshot: str | None = None
     source_schema_url: str | None = None  # Hatrac URL to source schema JSON
+    # Clone parameters
     orphan_strategy: str = "fail"
    truncate_oversized: bool = False
    prune_hidden_fkeys: bool = False
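A note on the `CloneReport.summary` property added above: it and `to_dict()` now share one code path via `dataclasses.asdict`, so the two can be cross-checked directly. A minimal sketch, assuming a populated `CloneReport` named `report`:

    s = report.summary  # CloneReportSummary
    print(f"{s.tables_restored} tables restored ({s.total_rows_restored} rows), "
          f"{s.errors} errors, {s.warnings} warnings")
    assert report.to_dict()["summary"] == asdict(report.summary)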
@@ -339,15 +377,21 @@ class CloneDetails:
     asset_mode: str = "refs"
     exclude_schemas: list[str] = field(default_factory=list)
     exclude_objects: list[str] = field(default_factory=list)
+    add_ml_schema: bool = False
+    copy_annotations: bool = True
+    copy_policy: bool = True
+    reinitialize_dataset_versions: bool = True
+    # Statistics
     rows_copied: int = 0
     rows_skipped: int = 0
+    skipped_rids: list[str] = field(default_factory=list)  # RIDs of skipped rows
     truncated_count: int = 0
     orphan_rows_removed: int = 0
     orphan_rows_nullified: int = 0
     fkeys_pruned: int = 0
 
     def to_dict(self) -> dict[str, Any]:
-        return {
+        result = {
             "source_hostname": self.source_hostname,
             "source_catalog_id": self.source_catalog_id,
             "source_snapshot": self.source_snapshot,
@@ -359,6 +403,10 @@ class CloneDetails:
             "asset_mode": self.asset_mode,
             "exclude_schemas": self.exclude_schemas,
             "exclude_objects": self.exclude_objects,
+            "add_ml_schema": self.add_ml_schema,
+            "copy_annotations": self.copy_annotations,
+            "copy_policy": self.copy_policy,
+            "reinitialize_dataset_versions": self.reinitialize_dataset_versions,
             "rows_copied": self.rows_copied,
             "rows_skipped": self.rows_skipped,
             "truncated_count": self.truncated_count,
@@ -366,6 +414,9 @@ class CloneDetails:
             "orphan_rows_nullified": self.orphan_rows_nullified,
             "fkeys_pruned": self.fkeys_pruned,
         }
+        if self.skipped_rids:
+            result["skipped_rids"] = self.skipped_rids
+        return result
 
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> "CloneDetails":
@@ -381,8 +432,13 @@ class CloneDetails:
             asset_mode=data.get("asset_mode", "refs"),
             exclude_schemas=data.get("exclude_schemas", []),
             exclude_objects=data.get("exclude_objects", []),
+            add_ml_schema=data.get("add_ml_schema", False),
+            copy_annotations=data.get("copy_annotations", True),
+            copy_policy=data.get("copy_policy", True),
+            reinitialize_dataset_versions=data.get("reinitialize_dataset_versions", True),
             rows_copied=data.get("rows_copied", 0),
             rows_skipped=data.get("rows_skipped", 0),
+            skipped_rids=data.get("skipped_rids", []),
             truncated_count=data.get("truncated_count", 0),
             orphan_rows_removed=data.get("orphan_rows_removed", 0),
             orphan_rows_nullified=data.get("orphan_rows_nullified", 0),
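Because every new field gets a default in `from_dict()`, and `to_dict()` only emits `skipped_rids` when it is non-empty, clone details written by 1.17.14 still round-trip under 1.17.16. A sketch:

    details = CloneDetails(source_hostname="example.org", source_catalog_id="1")
    restored = CloneDetails.from_dict(details.to_dict())
    assert restored.copy_annotations is True  # new field, default applied
    assert restored.skipped_rids == []        # omitted from the dict when empty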
@@ -677,7 +733,7 @@ def _copy_table_data_with_retry(
     report: "CloneReport",
     deferred_indexes: dict[str, list[dict]],
     truncate_oversized: bool = False,
-) -> tuple[int, int, list[TruncatedValue]]:
+) -> tuple[int, int, list[str], list[TruncatedValue]]:
     """Copy data for a single table with retry logic for index errors.
 
     If a btree index size error occurs, this function will:
@@ -698,7 +754,7 @@ def _copy_table_data_with_retry(
         truncate_oversized: If True, truncate oversized values instead of skipping rows.
 
     Returns:
-        Tuple of (rows_copied, rows_skipped, truncated_values).
+        Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
         rows_copied is -1 if the copy failed entirely.
     """
     tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
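Every caller must now unpack a 4-tuple. A sketch of an updated call site, showing only the parameters visible in this diff (the full signature may include more):

    table_rows, rows_skipped, skipped_rids, truncated = _copy_table_data_with_retry(
        src_catalog=src_catalog,
        dst_catalog=dst_catalog,
        sname=sname,
        tname=tname,
        report=report,
        deferred_indexes=deferred_indexes,
        truncate_oversized=truncate_oversized,
    )
    if table_rows < 0:
        logger.warning(f"Copy failed after skipping {rows_skipped} rows: {skipped_rids}")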
@@ -711,6 +767,7 @@ def _copy_table_data_with_retry(
     last = None
     table_rows = 0
     rows_skipped = 0
+    skipped_rids: list[str] = []  # Track RIDs of skipped rows
     truncated_values: list[TruncatedValue] = []
     row_by_row_mode = False
     problematic_index = None
@@ -768,7 +825,7 @@ def _copy_table_data_with_retry(
             ).json()
         except Exception as e:
             logger.warning(f"Failed to read from {sname}:{tname}: {e}")
-            return -1, rows_skipped, truncated_values
+            return -1, rows_skipped, skipped_rids, truncated_values
 
         if not page:
             break
@@ -809,11 +866,14 @@ def _copy_table_data_with_retry(
 
                     rows_skipped += 1
                     rid = row.get('RID', 'unknown')
+                    skipped_rids.append(rid)
                     logger.debug(f"Skipping row {rid} in {table_key} due to index size limit")
                 else:
                     # Different error - log and skip
                     rows_skipped += 1
-                    logger.debug(f"Skipping row in {table_key}: {row_error}")
+                    rid = row.get('RID', 'unknown')
+                    skipped_rids.append(rid)
+                    logger.debug(f"Skipping row {rid} in {table_key}: {row_error}")
             last = page[-1]['RID']
         else:
             # Normal batch mode
@@ -884,14 +944,17 @@ def _copy_table_data_with_retry(
 
                         rows_skipped += 1
                         rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
                         logger.debug(f"Skipping row {rid} due to index size limit")
                     else:
                         rows_skipped += 1
-                        logger.debug(f"Skipping row: {row_error}")
+                        rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
+                        logger.debug(f"Skipping row {rid}: {row_error}")
                 last = page[-1]['RID']
             else:
                 logger.warning(f"Failed to write to {sname}:{tname}: {e}")
-                return -1, rows_skipped, truncated_values
+                return -1, rows_skipped, skipped_rids, truncated_values
 
     # Report skipped rows
     if rows_skipped > 0:
@@ -903,8 +966,9 @@ def _copy_table_data_with_retry(
             details=f"Index '{problematic_index}' on column '{problematic_column}'",
             action="These rows have values too large for btree index (>2704 bytes)",
             row_count=rows_skipped,
+            skipped_rids=skipped_rids if skipped_rids else None,
        ))
-        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits")
+        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits: RIDs={skipped_rids}")
 
    # Report truncated values
    if truncated_values:
@@ -919,7 +983,7 @@ def _copy_table_data_with_retry(
         ))
         logger.info(f"Truncated {len(truncated_values)} values in {table_key}")
 
-    return table_rows, rows_skipped, truncated_values
+    return table_rows, rows_skipped, skipped_rids, truncated_values
 
 
@@ -946,6 +1010,925 @@ def _rebuild_deferred_indexes(
     logger.info(f"Reporting {sum(len(v) for v in deferred_indexes.values())} index issues...")
 
 
+# =============================================================================
+# Subset Clone Helpers
+# =============================================================================
+
+
+# Export annotation tag
+_export_tag = "tag:isrd.isi.edu,2019:export"
+
+
+def _parse_export_annotation_tables(
+    table: Any,
+    paths_discovered: list[list[str]] | None = None,
+) -> tuple[list[str], list[list[str]]]:
+    """Parse export annotation from a table to extract tables and paths.
+
+    The export annotation (tag:isrd.isi.edu,2019:export) defines which tables
+    should be exported when downloading a row as a BDBag. This function extracts
+    the table names from the annotation paths.
+
+    Args:
+        table: ERMrest Table object with annotations.
+        paths_discovered: Optional list to append discovered paths to (for reuse).
+
+    Returns:
+        Tuple of (tables_list, paths_list) where:
+        - tables_list: List of table names in "schema:table" format
+        - paths_list: List of paths, each path is a list of "schema:table" strings
+    """
+    if paths_discovered is None:
+        paths_discovered = []
+
+    tables: set[str] = set()
+
+    # Add the root table itself
+    root_table_spec = f"{table.schema.name}:{table.name}"
+    tables.add(root_table_spec)
+
+    # Get the export annotation
+    export_annotation = table.annotations.get(_export_tag, {})
+
+    # Export annotations can have multiple contexts (*, detailed, etc.)
+    # We'll look at all of them
+    for context_key, context_value in export_annotation.items():
+        templates = context_value.get("templates", [])
+        for template in templates:
+            outputs = template.get("outputs", [])
+            for output in outputs:
+                source = output.get("source", {})
+                path_str = source.get("path", "")
+
+                if not path_str:
+                    continue
+
+                # Parse the path - it's in ERMrest format like "schema:table/schema:table2/..."
+                # Split by "/" and parse each segment
+                path_segments = path_str.split("/")
+                current_path: list[str] = [root_table_spec]
+
+                for segment in path_segments:
+                    # Skip empty segments
+                    if not segment:
+                        continue
+
+                    # Skip attribute projections (contain ":" followed by "=")
+                    if "=" in segment:
+                        continue
+
+                    # Parse schema:table format
+                    if ":" in segment:
+                        # Could be "schema:table" or complex path syntax
+                        # For simple schema:table, just add it
+                        parts = segment.split(":")
+                        if len(parts) == 2 and not any(c in segment for c in ["(", ")", "!", "@"]):
+                            schema, tname = parts
+                            table_spec = f"{schema}:{tname}"
+                            tables.add(table_spec)
+                            current_path.append(table_spec)
+
+                if len(current_path) > 1:
+                    paths_discovered.append(current_path)
+
+    return sorted(tables), paths_discovered
+
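A sketch of the annotation shape this parser walks, using a stub in place of a real ERMrest Table (all schema and table names here are hypothetical):

    from types import SimpleNamespace

    stub = SimpleNamespace(
        name="dataset",
        schema=SimpleNamespace(name="isa"),
        annotations={_export_tag: {"detailed": {"templates": [
            {"outputs": [{"source": {"path": "isa:biosample/isa:replicate"}}]},
        ]}}},
    )
    tables, paths = _parse_export_annotation_tables(stub)
    # tables == ["isa:biosample", "isa:dataset", "isa:replicate"]
    # paths  == [["isa:dataset", "isa:biosample", "isa:replicate"]]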
+def _compute_reachable_rids_from_paths(
+    catalog: ErmrestCatalog,
+    root_rid: str,
+    root_table: str,
+    paths: list[list[str]],
+    include_tables: list[str],
+    model: Any | None = None,
+) -> dict[str, set[str]]:
+    """Compute RIDs reachable from root_rid using predefined paths.
+
+    This is more efficient than FK graph traversal because it uses the paths
+    defined in the export annotation, which are already known to work.
+
+    After following the paths, also discovers FK references from reachable rows
+    back to tables in the include list. This ensures FK integrity by including
+    referenced rows that weren't found via the export paths.
+
+    Args:
+        catalog: Source catalog connection.
+        root_rid: Starting RID.
+        root_table: Root table in "schema:table" format.
+        paths: List of paths from export annotation, each path is a list of
+            "schema:table" strings starting with the root table.
+        include_tables: All tables to track reachability for.
+        model: Optional ERMrest Model for FK relationship discovery.
+
+    Returns:
+        Dict mapping "schema:table" -> set of reachable RIDs.
+    """
+    # Initialize reachable sets for all tables
+    reachable: dict[str, set[str]] = {t: set() for t in include_tables}
+    reachable[root_table].add(root_rid)
+
+    # Query each path from the export annotation
+    for path in paths:
+        if len(path) < 2:
+            continue
+
+        # Build ERMrest query following the path
+        # Start with the root table and RID filter
+        query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+
+        # Add each step in the path (skip the root table)
+        for table_spec in path[1:]:
+            query += f"/{_quote_table_spec(table_spec)}"
+
+        # Query for rows at the end of the path
+        target_table = path[-1]
+        if target_table not in reachable:
+            continue
+
+        try:
+            result = catalog.get(query).json()
+            for row in result:
+                if "RID" in row:
+                    reachable[target_table].add(row["RID"])
+            if result:
+                logger.debug(f"Path {' -> '.join(path)}: found {len(result)} rows")
+        except Exception as e:
+            logger.debug(f"Path query failed: {query}: {e}")
+            continue
+
+    # Note: FK reference expansion was too slow for large datasets and is disabled.
+    # Instead, rely on orphan_strategy (DELETE/NULLIFY) to handle any FK violations
+    # that occur when referenced rows weren't found via the export paths.
+
+    return reachable
+
+
+def _expand_reachable_via_fk_references(
+    catalog: ErmrestCatalog,
+    reachable: dict[str, set[str]],
+    include_tables: list[str],
+    model: Any,
+) -> None:
+    """Expand reachable RIDs by following FK references.
+
+    For each table with reachable rows, find FK columns that reference other
+    included tables and add the referenced RIDs to the reachable set.
+
+    Args:
+        catalog: Source catalog connection.
+        reachable: Dict mapping "schema:table" -> set of RIDs (modified in place).
+        include_tables: Tables to include.
+        model: ERMrest Model object.
+    """
+    # Build table lookup
+    table_lookup: dict[tuple[str, str], str] = {}
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        table_lookup[(schema, table_name)] = table_spec
+
+    # Iterate until no new RIDs are discovered
+    max_iterations = 10  # Prevent infinite loops
+    iteration = 0
+
+    while iteration < max_iterations:
+        iteration += 1
+        new_rids_found = False
+
+        for table_spec in include_tables:
+            current_rids = reachable.get(table_spec, set())
+            if not current_rids:
+                continue
+
+            schema, table_name = table_spec.split(":", 1)
+            try:
+                table = model.schemas[schema].tables[table_name]
+            except KeyError:
+                continue
+
+            # Check each FK for references to other included tables
+            for fk in table.foreign_keys:
+                pk_table = fk.pk_table
+                pk_key = (pk_table.schema.name, pk_table.name)
+                pk_spec = table_lookup.get(pk_key)
+
+                if not pk_spec:
+                    continue  # Target table not in our include list
+
+                # Get the FK column name
+                if not fk.foreign_key_columns:
+                    continue
+                fk_col = fk.foreign_key_columns[0].name
+
+                # Query for FK values from reachable rows
+                # Do this in batches to avoid URL length limits
+                # Ensure all RIDs are strings
+                rids_list = [str(r) for r in current_rids if r is not None]
+                batch_size = 100
+
+                for i in range(0, len(rids_list), batch_size):
+                    batch = rids_list[i:i + batch_size]
+                    rid_filter = ",".join(urlquote(r) for r in batch)
+
+                    try:
+                        # Get distinct FK values
+                        query = f"/attributegroup/{_quote_table_spec(table_spec)}/RID=any({rid_filter})/{urlquote(fk_col)}"
+                        result = catalog.get(query).json()
+
+                        for row in result:
+                            fk_value = row.get(fk_col)
+                            if fk_value is not None:
+                                # Ensure FK value is a string
+                                fk_value_str = str(fk_value)
+                                if fk_value_str not in reachable[pk_spec]:
+                                    reachable[pk_spec].add(fk_value_str)
+                                    new_rids_found = True
+                    except Exception as e:
+                        logger.debug(f"FK reference query failed: {e}")
+                        continue
+
+        if not new_rids_found:
+            break
+
+    if iteration > 1:
+        logger.debug(f"FK reference expansion completed in {iteration} iterations")
+
+
+def _expand_tables_with_associations(
+    model: Any,
+    include_tables: list[str],
+) -> tuple[list[str], list[str]]:
+    """Expand table list to include association tables needed for FK integrity.
+
+    Given a list of tables, finds all association tables that connect pairs
+    of included tables and adds them to the list.
+
+    Args:
+        model: ERMrest Model object.
+        include_tables: List of table names in "schema:table" format.
+
+    Returns:
+        Tuple of (all_tables, association_tables_added) where:
+        - all_tables: Original tables plus added association tables
+        - association_tables_added: Just the association tables that were added
+    """
+    # Parse table names to (schema, table) tuples
+    included_set: set[tuple[str, str]] = set()
+    for table_spec in include_tables:
+        if ":" in table_spec:
+            schema, table = table_spec.split(":", 1)
+            included_set.add((schema, table))
+        else:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+
+    # Find association tables connecting included tables
+    associations_added: list[str] = []
+
+    for schema_name, table_name in list(included_set):
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Check for associations from this table
+        for assoc in table.find_associations(pure=False):
+            assoc_table = assoc.table
+            assoc_key = (assoc_table.schema.name, assoc_table.name)
+
+            # Already included
+            if assoc_key in included_set:
+                continue
+
+            # Check if the other end of the association is in our included set
+            for other_fk in assoc.other_fkeys:
+                other_table = other_fk.pk_table
+                other_key = (other_table.schema.name, other_table.name)
+
+                if other_key in included_set:
+                    # This association connects two included tables
+                    included_set.add(assoc_key)
+                    assoc_spec = f"{assoc_key[0]}:{assoc_key[1]}"
+                    if assoc_spec not in associations_added:
+                        associations_added.append(assoc_spec)
+                    break
+
+    all_tables = list(include_tables) + associations_added
+    return all_tables, associations_added
+
+
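The intended effect, with hypothetical names: if `demo:Subject` and `demo:Image` are both included and an association table `demo:Subject_Image` links them, the expansion pulls it in automatically:

    all_tables, added = _expand_tables_with_associations(
        model, ["demo:Subject", "demo:Image"]
    )
    # added      == ["demo:Subject_Image"]  (hypothetical association table)
    # all_tables == ["demo:Subject", "demo:Image", "demo:Subject_Image"]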
+def _expand_tables_with_vocabularies(
+    model: Any,
+    include_tables: list[str],
+) -> tuple[list[str], list[str]]:
+    """Expand table list to include vocabulary tables referenced by included tables.
+
+    Examines FK targets of included tables and adds any that are vocabulary tables.
+
+    Args:
+        model: ERMrest Model object.
+        include_tables: List of table names in "schema:table" format.
+
+    Returns:
+        Tuple of (all_tables, vocabulary_tables_added) where:
+        - all_tables: Original tables plus added vocabulary tables
+        - vocabulary_tables_added: Just the vocabulary tables that were added
+    """
+    def is_vocabulary(table) -> bool:
+        return VOCAB_COLUMNS.issubset({c.name.upper() for c in table.columns})
+
+    # Parse table names
+    included_set: set[tuple[str, str]] = set()
+    for table_spec in include_tables:
+        if ":" in table_spec:
+            schema, table = table_spec.split(":", 1)
+            included_set.add((schema, table))
+
+    vocabularies_added: list[str] = []
+
+    for schema_name, table_name in list(included_set):
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Check FK targets for vocabulary tables
+        for fk in table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+
+            if pk_key in included_set:
+                continue
+
+            if is_vocabulary(pk_table):
+                included_set.add(pk_key)
+                vocab_spec = f"{pk_key[0]}:{pk_key[1]}"
+                if vocab_spec not in vocabularies_added:
+                    vocabularies_added.append(vocab_spec)
+
+    all_tables = list(include_tables) + vocabularies_added
+    return all_tables, vocabularies_added
+
+
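The vocabulary test here is purely structural: a referenced table counts as a vocabulary when its upper-cased column names cover `VOCAB_COLUMNS`. With hypothetical names:

    all_tables, vocab_added = _expand_tables_with_vocabularies(model, ["demo:Image"])
    # If demo:Image has an FK to demo:Image_Type, and that table carries the
    # standard vocabulary columns (whatever VOCAB_COLUMNS names), then:
    # vocab_added == ["demo:Image_Type"]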
1371
+ def _quote_table_spec(table_spec: str) -> str:
1372
+ """URL-quote a table specification for ERMrest queries.
1373
+
1374
+ ERMrest uses schema:table format where the colon must NOT be encoded.
1375
+ This function quotes the schema and table names separately.
1376
+
1377
+ Args:
1378
+ table_spec: Table specification in "schema:table" format.
1379
+
1380
+ Returns:
1381
+ URL-safe string with schema and table quoted but colon preserved.
1382
+ """
1383
+ schema, table = table_spec.split(":", 1)
1384
+ return f"{urlquote(schema)}:{urlquote(table)}"
1385
+
1386
+
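So a spec containing spaces quotes each half while keeping the separator intact:

    _quote_table_spec("My Schema:Image Type")  # -> "My%20Schema:Image%20Type"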
+def _discover_reachable_tables(
+    model: Any,
+    start_tables: list[str],
+    exclude_tables: set[tuple[str, str]] | None = None,
+    exclude_schemas: set[str] | None = None,
+) -> list[str]:
+    """Discover all tables reachable from start tables via FK relationships.
+
+    Traverses FK graph in both directions (outbound and inbound FKs) to find
+    all connected tables, excluding system schemas and specified exclusions.
+
+    Args:
+        model: ERMrest Model object.
+        start_tables: Starting tables in "schema:table" format.
+        exclude_tables: Set of (schema, table) tuples to exclude from discovery.
+        exclude_schemas: Set of schema names to exclude entirely.
+
+    Returns:
+        List of reachable table names in "schema:table" format.
+    """
+    exclude_tables = exclude_tables or set()
+    exclude_schemas = exclude_schemas or set()
+
+    # System schemas to always exclude
+    system_schemas = {"public", "_acl_admin", "WWW"}
+    all_excluded_schemas = system_schemas | exclude_schemas
+
+    # Parse start tables
+    discovered: set[tuple[str, str]] = set()
+    to_visit: list[tuple[str, str]] = []
+
+    for table_spec in start_tables:
+        if ":" not in table_spec:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+        schema, table = table_spec.split(":", 1)
+        key = (schema, table)
+        if key not in exclude_tables and schema not in all_excluded_schemas:
+            discovered.add(key)
+            to_visit.append(key)
+
+    # BFS traversal of FK graph
+    while to_visit:
+        current_key = to_visit.pop(0)
+        schema_name, table_name = current_key
+
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Find connected tables via outbound FKs (this table references other tables)
+        for fk in table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+
+            if pk_key in discovered or pk_key in exclude_tables:
+                continue
+            if pk_table.schema.name in all_excluded_schemas:
+                continue
+
+            discovered.add(pk_key)
+            to_visit.append(pk_key)
+
+        # Find connected tables via inbound FKs (other tables reference this table)
+        for fk in table.referenced_by:
+            ref_table = fk.table
+            ref_key = (ref_table.schema.name, ref_table.name)
+
+            if ref_key in discovered or ref_key in exclude_tables:
+                continue
+            if ref_table.schema.name in all_excluded_schemas:
+                continue
+
+            discovered.add(ref_key)
+            to_visit.append(ref_key)
+
+    # Convert to schema:table format
+    return [f"{schema}:{table}" for schema, table in sorted(discovered)]
+
+
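Usage sketch (hypothetical names); note the traversal is bidirectional, so tables that merely reference a start table are discovered too:

    tables = _discover_reachable_tables(
        model,
        ["demo:Subject"],
        exclude_tables={("demo", "Execution")},
        exclude_schemas={"audit"},
    )
    # e.g. ["demo:Image", "demo:Subject", "demo:Subject_Image", ...]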
+def _build_path_query(
+    root_table: str,
+    root_rid: str,
+    path: list[tuple[str, str]],
+) -> str:
+    """Build an ERMrest path query to traverse FK relationships.
+
+    Args:
+        root_table: Starting table in "schema:table" format.
+        root_rid: RID of the starting row.
+        path: List of (schema, table) tuples representing the path.
+
+    Returns:
+        ERMrest query string like "/entity/Schema:Table/RID=X/Schema:Next/..."
+    """
+    query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+    for schema, table in path:
+        query += f"/{urlquote(schema)}:{urlquote(table)}"
+    return query
+
+
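For example, with a hypothetical root and a two-step path:

    _build_path_query("isa:dataset", "1-ABC", [("isa", "biosample"), ("isa", "replicate")])
    # -> "/entity/isa:dataset/RID=1-ABC/isa:biosample/isa:replicate"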
+def _compute_reachable_rids(
+    catalog: ErmrestCatalog,
+    root_rid: str,
+    include_tables: list[str],
+    model: Any,
+) -> dict[str, set[str]]:
+    """Compute RIDs reachable from root_rid for each included table.
+
+    Uses FK graph traversal (both directions) to find all rows that are
+    connected to the root row through FK relationships.
+
+    Args:
+        catalog: Source catalog connection.
+        root_rid: Starting RID.
+        include_tables: Tables to compute reachability for ("schema:table" format).
+        model: ERMrest Model object.
+
+    Returns:
+        Dict mapping "schema:table" -> set of reachable RIDs.
+    """
+    # First, resolve the root RID to find its table
+    root_table_key = None
+    root_table = None
+
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        try:
+            uri = f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(root_rid)}"
+            result = catalog.get(uri).json()
+            if result:
+                root_table_key = table_spec
+                root_table = model.schemas[schema].tables[table_name]
+                break
+        except Exception:
+            continue
+
+    if root_table_key is None:
+        raise ValueError(f"Root RID {root_rid} not found in any of the included tables")
+
+    # Initialize reachable sets
+    reachable: dict[str, set[str]] = {t: set() for t in include_tables}
+    reachable[root_table_key].add(root_rid)
+
+    # Parse include_tables to lookup dict
+    table_lookup: dict[tuple[str, str], str] = {}
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        table_lookup[(schema, table_name)] = table_spec
+
+    # Build paths from root table using FK traversal (both directions)
+    def find_paths(
+        start_table: Any,
+        visited: set[tuple[str, str]],
+        current_path: list[tuple[str, str]],
+    ) -> list[list[tuple[str, str]]]:
+        """Recursively find all FK paths from start_table to included tables."""
+        paths = []
+
+        # Get all connected tables (both FK directions)
+        connected = []
+
+        # Outbound FKs (this table references other tables)
+        for fk in start_table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+            if pk_key not in visited and pk_key in table_lookup:
+                connected.append(pk_table)
+
+        # Inbound FKs (other tables reference this table)
+        for fk in start_table.referenced_by:
+            ref_table = fk.table
+            ref_key = (ref_table.schema.name, ref_table.name)
+            if ref_key not in visited and ref_key in table_lookup:
+                connected.append(ref_table)
+
+        for next_table in connected:
+            next_key = (next_table.schema.name, next_table.name)
+            new_path = current_path + [next_key]
+
+            # This path reaches the target table
+            paths.append(new_path)
+
+            # Continue exploring from this table
+            new_visited = visited | {next_key}
+            paths.extend(find_paths(next_table, new_visited, new_path))
+
+        return paths
+
+    # Find all paths from root table
+    root_key = (root_table.schema.name, root_table.name)
+    all_paths = find_paths(root_table, {root_key}, [])
+
+    # For each path, query for reachable rows
+    for path in all_paths:
+        if not path:
+            continue
+
+        target_key = path[-1]
+        target_spec = table_lookup.get(target_key)
+        if not target_spec:
+            continue
+
+        # Build and execute the path query
+        query = _build_path_query(root_table_key, root_rid, path)
+        try:
+            result = catalog.get(query).json()
+            for row in result:
+                if "RID" in row:
+                    reachable[target_spec].add(row["RID"])
+        except Exception as e:
+            logger.debug(f"Path query failed: {query}: {e}")
+            continue
+
+    # Also need to check if reachable rows reference other reachable rows
+    # This handles transitive reachability through association tables
+    changed = True
+    while changed:
+        changed = False
+        for table_spec in include_tables:
+            schema, table_name = table_spec.split(":", 1)
+            try:
+                table = model.schemas[schema].tables[table_name]
+            except KeyError:
+                continue
+
+            current_rids = reachable[table_spec].copy()
+
+            # Check FKs from this table
+            for fk in table.foreign_keys:
+                pk_table = fk.pk_table
+                pk_spec = f"{pk_table.schema.name}:{pk_table.name}"
+                if pk_spec not in include_tables:
+                    continue
+
+                fk_col = fk.foreign_key_columns[0].name
+
+                # For each reachable row in this table, find the referenced row
+                for rid in current_rids:
+                    try:
+                        row = catalog.get(f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(rid)}").json()
+                        if row and fk_col in row[0] and row[0][fk_col]:
+                            ref_rid = row[0][fk_col]
+                            if ref_rid not in reachable[pk_spec]:
+                                reachable[pk_spec].add(ref_rid)
+                                changed = True
+                    except Exception:
+                        continue
+
+            # Check FKs to this table (inbound)
+            for fk in table.referenced_by:
+                ref_table = fk.table
+                ref_spec = f"{ref_table.schema.name}:{ref_table.name}"
+                if ref_spec not in include_tables:
+                    continue
+
+                fk_col = fk.foreign_key_columns[0].name
+
+                # For each reachable row in this table, find rows that reference it
+                for rid in current_rids:
+                    try:
+                        result = catalog.get(
+                            f"/entity/{_quote_table_spec(ref_spec)}/{urlquote(fk_col)}={urlquote(rid)}"
+                        ).json()
+                        for row in result:
+                            if "RID" in row and row["RID"] not in reachable[ref_spec]:
+                                reachable[ref_spec].add(row["RID"])
+                                changed = True
+                    except Exception:
+                        continue
+
+    return reachable
+
+
+def _copy_data_via_export_paths(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    root_table: str,
+    root_rid: str,
+    export_paths: list[list[str]],
+    all_tables: list[str],
+    report: "CloneReport",
+    truncate_oversized: bool = False,
+    page_size: int = 1000,
+) -> dict[str, int]:
+    """Copy data using export paths to respect row-level security.
+
+    Instead of computing reachable RIDs and fetching them individually (which can
+    fail due to row-level ACLs), this function copies data by following the export
+    paths directly. This ensures we only copy rows that are actually visible.
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        root_table: Root table in "schema:table" format.
+        root_rid: Root RID to start from.
+        export_paths: Paths from export annotation.
+        all_tables: All tables to copy (for vocabulary tables not in paths).
+        report: Clone report for recording issues.
+        truncate_oversized: Whether to truncate oversized values.
+        page_size: Number of rows per batch.
+
+    Returns:
+        Dict mapping table spec -> rows copied.
+    """
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+    rows_by_table: dict[str, int] = {}
+
+    def truncate_row(row: dict) -> tuple[dict, list[TruncatedValue]]:
+        """Truncate oversized values in a row."""
+        truncated_values = []
+        modified = row.copy()
+        for col, value in row.items():
+            if isinstance(value, str):
+                value_bytes = len(value.encode('utf-8'))
+                if value_bytes > MAX_INDEX_VALUE_BYTES:
+                    max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                    truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                        max_chars -= 100
+                        truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    modified[col] = truncated
+                    truncated_values.append(TruncatedValue(
+                        table="",
+                        rid=str(row.get('RID', 'unknown')),
+                        column=col,
+                        original_bytes=value_bytes,
+                        truncated_bytes=len(truncated.encode('utf-8')),
+                    ))
+        return modified, truncated_values
+
+    def copy_with_path(path_query: str, table_spec: str) -> int:
+        """Copy data using a path query, returning rows copied."""
+        sname, tname = table_spec.split(":", 1)
+        tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+        rows_copied = 0
+        last_rid = ""
+
+        while True:
+            # Add pagination
+            if last_rid:
+                query = f"{path_query}@sort(RID)@after({urlquote(last_rid)})?limit={page_size}"
+            else:
+                query = f"{path_query}@sort(RID)?limit={page_size}"
+
+            try:
+                page = src_catalog.get(query).json()
+            except Exception as e:
+                logger.warning(f"Path query failed {path_query}: {e}")
+                break
+
+            if not page:
+                break
+
+            # Process rows
+            rows_to_insert = []
+            for row in page:
+                if truncate_oversized:
+                    modified, _ = truncate_row(row)
+                    rows_to_insert.append(modified)
+                else:
+                    rows_to_insert.append(row)
+
+            # Insert
+            try:
+                dst_catalog.post(f"/entity/{tname_uri}", json=rows_to_insert)
+                rows_copied += len(rows_to_insert)
+            except Exception as e:
+                # Try row-by-row on failure
+                for row in rows_to_insert:
+                    try:
+                        dst_catalog.post(f"/entity/{tname_uri}", json=[row])
+                        rows_copied += 1
+                    except Exception:
+                        logger.debug(f"Failed to insert row: {e}")
+
+            if len(page) < page_size:
+                break
+            last_rid = page[-1].get("RID", "")
+            if not last_rid:
+                break
+
+        return rows_copied
+
+    # Copy root table (just the root row)
+    root_sname, root_tname = root_table.split(":", 1)
+    root_uri = f"{urlquote(root_sname)}:{urlquote(root_tname)}"
+    try:
+        root_row = src_catalog.get(f"/entity/{root_uri}/RID={urlquote(root_rid)}").json()
+        if root_row:
+            dst_catalog.post(f"/entity/{root_uri}", json=root_row)
+            rows_by_table[root_table] = 1
+            logger.info(f"Copied 1 row for {root_table}")
+    except Exception as e:
+        logger.warning(f"Failed to copy root row: {e}")
+
+    # Copy data for each export path
+    tables_copied = {root_table}
+    for path in export_paths:
+        if len(path) < 2:
+            continue
+
+        # Build the path query starting from root
+        query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+        for table_spec in path[1:]:
+            query += f"/{_quote_table_spec(table_spec)}"
+
+        target_table = path[-1]
+        if target_table in tables_copied:
+            continue
+
+        rows = copy_with_path(query, target_table)
+        rows_by_table[target_table] = rows_by_table.get(target_table, 0) + rows
+        tables_copied.add(target_table)
+        logger.info(f"Copied {rows} rows for {target_table}")
+
+    # Copy vocabulary tables (full copy since they're not in paths)
+    for table_spec in all_tables:
+        if table_spec in tables_copied:
+            continue
+
+        # Check if it's a vocabulary table
+        sname, tname = table_spec.split(":", 1)
+        if sname.startswith("vocab") or "vocab" in sname.lower():
+            # Full copy of vocabulary table
+            query = f"/entity/{_quote_table_spec(table_spec)}"
+            rows = copy_with_path(query, table_spec)
+            rows_by_table[table_spec] = rows
+            tables_copied.add(table_spec)
+            logger.info(f"Copied {rows} rows for vocabulary table {table_spec}")
+
+    return rows_by_table
+
+
+def _copy_subset_table_data(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    reachable_rids: set[str],
+    page_size: int,
+    report: "CloneReport",
+    truncate_oversized: bool = False,
+) -> tuple[int, int, list[str], list[TruncatedValue]]:
+    """Copy only rows with RIDs in reachable_rids from source to destination.
+
+    Similar to _copy_table_data_with_retry but filters to only reachable RIDs.
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        sname: Schema name.
+        tname: Table name.
+        reachable_rids: Set of RIDs to copy.
+        page_size: Number of rows to fetch per request.
+        report: Clone report for recording issues.
+        truncate_oversized: Whether to truncate oversized values.
+
+    Returns:
+        Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
+    """
+    tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+    table_key = f"{sname}:{tname}"
+
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+
+    rows_copied = 0
+    rows_skipped = 0
+    skipped_rids: list[str] = []
+    truncated_values: list[TruncatedValue] = []
+
+    if not reachable_rids:
+        return 0, 0, [], []
+
+    # Convert RIDs to sorted list for pagination
+    rid_list = sorted(reachable_rids)
+
+    # Process in batches
+    for i in range(0, len(rid_list), page_size):
+        batch_rids = rid_list[i:i + page_size]
+
+        # Build query with RID filter
+        rid_filter = ",".join(urlquote(rid) for rid in batch_rids)
+        try:
+            page = src_catalog.get(f"/entity/{tname_uri}/RID=any({rid_filter})").json()
+        except Exception as e:
+            logger.warning(f"Failed to fetch batch from {table_key}: {e}")
+            rows_skipped += len(batch_rids)
+            skipped_rids.extend(batch_rids)
+            continue
+
+        if not page:
+            continue
+
+        # Optionally truncate oversized values
+        rows_to_insert = []
+        for row in page:
+            if truncate_oversized:
+                modified_row = row.copy()
+                for col, value in row.items():
+                    if isinstance(value, str):
+                        value_bytes = len(value.encode('utf-8'))
+                        if value_bytes > MAX_INDEX_VALUE_BYTES:
+                            max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                            truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                            while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                                max_chars -= 100
+                                truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                            modified_row[col] = truncated
+                            truncated_values.append(TruncatedValue(
+                                table=table_key,
+                                rid=str(row.get('RID', 'unknown')),
+                                column=col,
+                                original_bytes=value_bytes,
+                                truncated_bytes=len(truncated.encode('utf-8')),
+                            ))
+                rows_to_insert.append(modified_row)
+            else:
+                rows_to_insert.append(row)
+
+        # Insert into destination
+        try:
+            dst_catalog.post(f"/entity/{tname_uri}", json=rows_to_insert)
+            rows_copied += len(rows_to_insert)
+        except Exception as e:
+            error_str = str(e)
+            if "index row size" in error_str.lower() or "btree" in error_str.lower():
+                # Row-by-row fallback for index size errors
+                for row in rows_to_insert:
+                    try:
+                        dst_catalog.post(f"/entity/{tname_uri}", json=[row])
+                        rows_copied += 1
+                    except Exception:
+                        rows_skipped += 1
+                        skipped_rids.append(str(row.get('RID', 'unknown')))
+            else:
+                logger.warning(f"Failed to insert into {table_key}: {e}")
+                rows_skipped += len(rows_to_insert)
+                skipped_rids.extend(str(row.get('RID', 'unknown')) for row in rows_to_insert)
+
+    return rows_copied, rows_skipped, skipped_rids, truncated_values
+
+
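Both new copy helpers repeat the same byte-oriented truncation rule: the limit is measured in UTF-8 bytes rather than characters, so the loop backs off in 100-character steps whenever the cut still overshoots on multi-byte text. The rule in isolation, as a standalone sketch of the inline logic above (extracting it into a shared helper would remove the duplication between `_copy_data_via_export_paths` and `_copy_subset_table_data`):

    MAX_INDEX_VALUE_BYTES = 2600
    TRUNCATE_SUFFIX = "...[TRUNCATED]"

    def truncate_for_index(value: str) -> str:
        """Trim value so its UTF-8 encoding stays under the btree index limit."""
        if len(value.encode("utf-8")) <= MAX_INDEX_VALUE_BYTES:
            return value
        max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode("utf-8"))
        truncated = value[:max_chars] + TRUNCATE_SUFFIX
        while len(truncated.encode("utf-8")) > MAX_INDEX_VALUE_BYTES:
            max_chars -= 100  # multi-byte characters can overshoot the byte budget
            truncated = value[:max_chars] + TRUNCATE_SUFFIX
        return truncated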
 def clone_catalog(
     source_hostname: str,
     source_catalog_id: str,
@@ -1072,7 +2055,7 @@ def clone_catalog(
     clone_timestamp = datetime.now(timezone.utc).isoformat()
 
     # Perform the three-stage clone
-    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values = _clone_three_stage(
+    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values = _clone_three_stage(
         src_catalog=src_catalog,
         dst_catalog=dst_catalog,
         copy_data=not schema_only,
@@ -1136,8 +2119,13 @@ def clone_catalog(
         asset_mode=asset_mode.value,
         exclude_schemas=exclude_schemas or [],
         exclude_objects=exclude_objects or [],
+        add_ml_schema=add_ml_schema,
+        copy_annotations=copy_annotations,
+        copy_policy=copy_policy,
+        reinitialize_dataset_versions=reinitialize_dataset_versions,
         rows_copied=total_rows_copied,
         rows_skipped=rows_skipped,
+        skipped_rids=skipped_rids,
         truncated_count=len(truncated_values),
         orphan_rows_removed=orphan_rows_removed,
         orphan_rows_nullified=orphan_rows_nullified,
@@ -1186,10 +2174,10 @@ def _clone_three_stage(
     prune_hidden_fkeys: bool,
     truncate_oversized: bool,
     report: CloneReport,
-) -> tuple[int, int, int, int, list[TruncatedValue]]:
+) -> tuple[int, int, int, int, list[str], list[TruncatedValue]]:
     """Perform three-stage catalog cloning.
 
-    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values)
+    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values)
     """
     src_model = src_catalog.getCatalogModel()
 
@@ -1282,7 +2270,7 @@ def _clone_three_stage(
             report.add_issue(CloneIssue(
                 severity=CloneIssueSeverity.WARNING,
                 category=CloneIssueCategory.FK_PRUNED,
-                message=f"FK pruned due to hidden reference data",
+                message="FK pruned due to hidden reference data",
                 table=f"{sname}:{tname}",
                 details=f"FK {fk_name} references columns with 'select': null",
                 action="Source catalog may have incoherent policies",
@@ -1328,6 +2316,7 @@ def _clone_three_stage(
     # Stage 2: Copy data
     total_rows = 0
     total_rows_skipped = 0
+    all_skipped_rids: list[str] = []
     all_truncated_values: list[TruncatedValue] = []
     deferred_indexes: dict[str, list[dict]] = {}  # Track indexes dropped for later rebuild
 
@@ -1343,7 +2332,7 @@ def _clone_three_stage(
             logger.debug(f"Copying data for {table_key}")
 
             # Use the new copy function with index error handling
-            table_rows, rows_skipped, truncated = _copy_table_data_with_retry(
+            table_rows, rows_skipped, skipped_rids, truncated = _copy_table_data_with_retry(
                 src_catalog=src_catalog,
                 dst_catalog=dst_catalog,
                 sname=sname,
@@ -1355,6 +2344,7 @@ def _clone_three_stage(
             )
 
             total_rows_skipped += rows_skipped
+            all_skipped_rids.extend(skipped_rids)
             all_truncated_values.extend(truncated)
 
             if table_rows < 0:
@@ -1581,7 +2571,7 @@ def _clone_three_stage(
     if copy_annotations or copy_policy:
         _copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)
 
-    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_truncated_values
+    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_skipped_rids, all_truncated_values
 
 
 def _identify_orphan_values(
@@ -1684,7 +2674,7 @@ def _delete_orphan_rows(
             report.add_issue(CloneIssue(
                 severity=CloneIssueSeverity.WARNING,
                 category=CloneIssueCategory.ORPHAN_ROWS,
-                message=f"Orphan rows deleted",
+                message="Orphan rows deleted",
                 table=table_key,
                 details=f"Missing references to: {ref_key} ({len(orphan_values)})",
                 action="Source catalog may have incoherent row-level policies",
@@ -1892,10 +2882,27 @@ def _post_clone_operations(
 
     if add_ml_schema:
         try:
-            from deriva_ml.schema import create_ml_schema
             catalog = server.connect_ermrest(result.catalog_id)
             create_ml_schema(catalog)
             result.ml_schema_added = True
+
+            # Apply catalog annotations (chaise-config, navbar, etc.)
+            # Import DerivaML locally to avoid circular import (deriva_ml.__init__ imports from clone.py)
+            try:
+                from deriva_ml import DerivaML
+                ml = DerivaML(result.hostname, result.catalog_id, check_auth=False)
+                ml.apply_catalog_annotations()
+                logger.info("Applied catalog annotations (chaise-config, navbar)")
+            except Exception as e:
+                logger.warning(f"Failed to apply catalog annotations: {e}")
+                if result.report:
+                    result.report.add_issue(CloneIssue(
+                        severity=CloneIssueSeverity.WARNING,
+                        category=CloneIssueCategory.SCHEMA_ISSUE,
+                        message="Failed to apply catalog annotations",
+                        details=str(e),
+                        action="Manually call apply_catalog_annotations() after clone",
+                    ))
         except Exception as e:
             logger.warning(f"Failed to add ML schema: {e}")
             if result.report:
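If the annotation step fails, the recorded action suggests manual recovery after the clone finishes; per the code above, that amounts to:

    from deriva_ml import DerivaML

    ml = DerivaML(result.hostname, result.catalog_id, check_auth=False)
    ml.apply_catalog_annotations()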
@@ -1945,3 +2952,549 @@ def _reinitialize_dataset_versions(
         logger.warning(f"Failed to reinitialize dataset versions: {e}")
 
     return result
+
+
+# =============================================================================
+# Clone Subset Catalog
+# =============================================================================
+
+
2962
+ def clone_subset_catalog(
2963
+ source_hostname: str,
2964
+ source_catalog_id: str,
2965
+ root_rid: str,
2966
+ include_tables: list[str] | None = None,
2967
+ exclude_objects: list[str] | None = None,
2968
+ exclude_schemas: list[str] | None = None,
2969
+ include_associations: bool = True,
2970
+ include_vocabularies: bool = True,
2971
+ use_export_annotation: bool = False,
2972
+ dest_hostname: str | None = None,
2973
+ alias: str | None = None,
2974
+ add_ml_schema: bool = False,
2975
+ asset_mode: AssetCopyMode = AssetCopyMode.REFERENCES,
2976
+ copy_annotations: bool = True,
2977
+ copy_policy: bool = True,
2978
+ source_credential: dict | None = None,
2979
+ dest_credential: dict | None = None,
2980
+ orphan_strategy: OrphanStrategy = OrphanStrategy.FAIL,
2981
+ prune_hidden_fkeys: bool = False,
2982
+ truncate_oversized: bool = False,
2983
+ reinitialize_dataset_versions: bool = True,
2984
+ ) -> CloneCatalogResult:
2985
+     """Clone a subset of a catalog containing only data reachable from a root RID.
+
+     Tables to include can be determined either by FK graph traversal or from an
+     export annotation. When use_export_annotation=True, the tables and paths
+     defined in the root table's export annotation (tag:isrd.isi.edu,2019:export)
+     are used, which matches the behavior of the BDBag export button.
+
+     Args:
+         source_hostname: Hostname of the source catalog server.
+         source_catalog_id: ID of the catalog to clone from.
+         root_rid: The starting RID from which to trace reachability.
+         include_tables: Optional list of additional table names ("schema:table"
+             format) to use as starting points for table discovery. If None,
+             discovery starts only from the root RID's table.
+         exclude_objects: List of tables ("schema:table" format) to exclude from
+             cloning. Paths through these tables are not followed.
+         exclude_schemas: List of schema names to exclude entirely from cloning.
+         include_associations: If True, auto-include association tables needed
+             for FK integrity between discovered tables.
+         include_vocabularies: If True, auto-include vocabulary tables referenced
+             by discovered tables.
+         use_export_annotation: If True, use the export annotation on the root
+             table to determine which tables and paths to clone. This matches the
+             behavior of the BDBag export button. If False (default), discover
+             tables via FK graph traversal.
+         dest_hostname: Destination hostname. If None, uses source_hostname.
+         alias: Optional alias for the new catalog.
+         add_ml_schema: If True, add the DerivaML schema to the clone.
+         asset_mode: How to handle assets (NONE, REFERENCES, or FULL).
+         copy_annotations: If True, copy annotations to the clone.
+         copy_policy: If True, copy ACLs/ACL bindings to the clone.
+         source_credential: Credentials for the source catalog.
+         dest_credential: Credentials for the destination catalog.
+         orphan_strategy: How to handle orphan rows (FAIL, DELETE, or NULLIFY).
+         prune_hidden_fkeys: If True, prune FKs with hidden reference data.
+         truncate_oversized: If True, truncate values that exceed index size limits.
+         reinitialize_dataset_versions: If True, reinitialize dataset versions.
+
+     Returns:
+         CloneCatalogResult with details of the cloned catalog.
+
+     Raises:
+         ValueError: If root_rid is not found in any table.
+         ValueError: If include_tables contains invalid table specifications.
+         ValueError: If use_export_annotation=True but no export annotation is found.
+
+     Example:
+         >>> # Clone using export annotation (matches BDBag export)
+         >>> result = clone_subset_catalog(
+         ...     source_hostname="www.facebase.org",
+         ...     source_catalog_id="1",
+         ...     root_rid="3-HXMC",
+         ...     use_export_annotation=True,
+         ...     alias="my-project-clone",
+         ... )
+
+         >>> # Clone all tables reachable from a dataset, excluding Execution table
+         >>> result = clone_subset_catalog(
+         ...     source_hostname="www.example.org",
+         ...     source_catalog_id="1",
+         ...     root_rid="ABC123",
+         ...     exclude_objects=["deriva-ml:Execution"],
+         ...     alias="my-subset",
+         ... )
+         >>> print(f"Created catalog {result.catalog_id}")
+
+         >>> # Clone with additional starting tables
+         >>> result = clone_subset_catalog(
+         ...     source_hostname="www.example.org",
+         ...     source_catalog_id="1",
+         ...     root_rid="ABC123",
+         ...     include_tables=["demo:Configuration"],  # Also discover from here
+         ...     exclude_schemas=["audit"],
+         ... )
+     """
+     include_tables = include_tables or []
+     exclude_objects = exclude_objects or []
+     exclude_schemas_set = set(exclude_schemas) if exclude_schemas else set()
+
+     # Validate table format for include_tables
+     for table_spec in include_tables:
+         if ":" not in table_spec:
+             raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+
+     # Parse exclude_objects into a set of (schema, table) tuples
+     excluded_tables: set[tuple[str, str]] = set()
+     for table_spec in exclude_objects:
+         if ":" not in table_spec:
+             raise ValueError(f"exclude_objects entries must be 'schema:table', got: {table_spec}")
+         schema, table = table_spec.split(":", 1)
+         excluded_tables.add((schema, table))
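+     # e.g. "deriva-ml:Execution" -> ("deriva-ml", "Execution"); exclusion
+     # matching below is exact on these (schema, table) pairs.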
+
+     dest_hostname = dest_hostname or source_hostname
+
+     # Get credentials
+     src_cred = source_credential or get_credential(source_hostname)
+     dst_cred = dest_credential or get_credential(dest_hostname)
+
+     # Connect to source catalog
+     src_server = DerivaServer("https", source_hostname, credentials=src_cred)
+     src_catalog = src_server.connect_ermrest(source_catalog_id)
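+     # Fetch the full ERMrest model (schemas, tables, keys, FKs); it drives the
+     # table discovery and schema reconstruction below.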
+     src_model = src_catalog.getCatalogModel()
+
+     logger.info(f"Connected to source catalog {source_hostname}/{source_catalog_id}")
+
+     # First, find the table containing the root RID
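+     # A RID is unique across the catalog but does not identify its table, so
+     # probe each candidate table with GET /entity/<schema:table>/RID=<rid>.
+     # Tables the caller cannot read raise errors and are simply skipped.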
+     root_table_key = None
+     for sname, schema in src_model.schemas.items():
+         if sname in {"public", "_acl_admin", "WWW"} or sname in exclude_schemas_set:
+             continue
+         for tname, table in schema.tables.items():
+             if (sname, tname) in excluded_tables:
+                 continue
+             if table.kind != 'table' or 'RID' not in table.column_definitions.elements:
+                 continue
+             try:
+                 table_spec = f"{sname}:{tname}"
+                 uri = f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(root_rid)}"
+                 result = src_catalog.get(uri).json()
+                 if result:
+                     root_table_key = table_spec
+                     break
+             except Exception:
+                 continue
+         if root_table_key:
+             break
+
+     if root_table_key is None:
+         raise ValueError(f"Root RID {root_rid} not found in any accessible table")
+
+     logger.info(f"Root RID {root_rid} found in table {root_table_key}")
+
+     # Get the root table object for export annotation parsing
+     root_schema, root_tname = root_table_key.split(":", 1)
+     root_table_obj = src_model.schemas[root_schema].tables[root_tname]
+
+     # Track paths for efficient RID computation (when using export annotation)
+     export_paths: list[list[str]] = []
+
+     if use_export_annotation:
+         # Use the export annotation to determine tables
+         logger.info("Using export annotation to determine tables...")
+         discovered_tables, export_paths = _parse_export_annotation_tables(
+             root_table_obj, []
+         )
+
+         # The root table itself is always present, so <= 1 means no usable paths
+         if not discovered_tables or len(discovered_tables) <= 1:
+             raise ValueError(
+                 f"No export annotation found on table {root_table_key} or annotation "
+                 f"contains no paths. Set use_export_annotation=False to use FK graph traversal."
+             )
+
+         logger.info(f"Export annotation defines {len(discovered_tables)} tables and {len(export_paths)} paths")
+
+         # Add any explicitly included tables
+         for table_spec in include_tables:
+             if table_spec not in discovered_tables:
+                 discovered_tables.append(table_spec)
+
+         # Filter out excluded tables
+         discovered_tables = [
+             t for t in discovered_tables
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+     else:
+         # Build starting tables: root table + any explicitly included tables
+         start_tables = [root_table_key]
+         for table_spec in include_tables:
+             if table_spec not in start_tables:
+                 start_tables.append(table_spec)
+
+         # Discover all reachable tables from the starting points via FK traversal
+         logger.info(f"Discovering tables reachable from {start_tables}...")
+         discovered_tables = _discover_reachable_tables(
+             model=src_model,
+             start_tables=start_tables,
+             exclude_tables=excluded_tables,
+             exclude_schemas=exclude_schemas_set,
+         )
+
+         logger.info(f"Discovered {len(discovered_tables)} connected tables")
+
+     # Expand with associations and vocabularies
+     all_tables = list(discovered_tables)
+     associations_added: list[str] = []
+     vocabularies_added: list[str] = []
+
+     if include_associations:
+         all_tables, associations_added = _expand_tables_with_associations(src_model, all_tables)
+         # Filter out excluded tables from associations
+         associations_added = [
+             t for t in associations_added
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         all_tables = [
+             t for t in all_tables
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         if associations_added:
+             logger.info(f"Auto-added association tables: {associations_added}")
+
+     if include_vocabularies:
+         all_tables, vocabularies_added = _expand_tables_with_vocabularies(src_model, all_tables)
+         # Filter out excluded tables from vocabularies
+         vocabularies_added = [
+             t for t in vocabularies_added
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         all_tables = [
+             t for t in all_tables
+             if tuple(t.split(":", 1)) not in excluded_tables
+         ]
+         if vocabularies_added:
+             logger.info(f"Auto-added vocabulary tables: {vocabularies_added}")
+
+     logger.info(f"Will clone {len(all_tables)} tables: {all_tables}")
+
+     # Compute reachable RIDs
+     logger.info(f"Computing reachable rows from root RID {root_rid}...")
+
+     if use_export_annotation and export_paths:
+         # Use the predefined paths from the export annotation (more efficient);
+         # also pass the model to enable FK reference expansion
+         reachable_rids = _compute_reachable_rids_from_paths(
+             src_catalog, root_rid, root_table_key, export_paths, all_tables, src_model
+         )
+     else:
+         # Use FK graph traversal
+         reachable_rids = _compute_reachable_rids(src_catalog, root_rid, all_tables, src_model)
+
+     total_rows = sum(len(rids) for rids in reachable_rids.values())
+     logger.info(f"Found {total_rows} reachable rows across {len(all_tables)} tables")
+
+     for table_spec, rids in reachable_rids.items():
+         if rids:
+             logger.debug(f"  {table_spec}: {len(rids)} rows")
+
+     # Create report
+     report = CloneReport()
+
+     # Parse tables into a set for quick lookup
+     included_tables: set[tuple[str, str]] = set()
+     for table_spec in all_tables:
+         schema, table = table_spec.split(":", 1)
+         included_tables.add((schema, table))
+
+     # Create destination catalog
+     dst_server = DerivaServer("https", dest_hostname, credentials=dst_cred)
+     dst_catalog = dst_server.create_ermrest_catalog()
+     dst_catalog_id = dst_catalog.catalog_id
+
+     logger.info(f"Created destination catalog {dest_hostname}/{dst_catalog_id}")
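+     # Everything below runs inside try/except so that a failed clone deletes
+     # the half-built destination catalog instead of leaking it (see the final
+     # except block).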
+
+     try:
+         # Build model content for the included tables only
+         new_model = []
+         fkeys_deferred = []
+         clone_states = {}
+
+         def prune_parts(d, *extra_victims):
+             victims = set(extra_victims)
+             if not copy_annotations:
+                 victims |= {'annotations'}
+             if not copy_policy:
+                 victims |= {'acls', 'acl_bindings'}
+             for k in victims:
+                 d.pop(k, None)
+             return d
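+         # e.g. prune_parts(table.prejson(), 'foreign_keys') drops the FK
+         # definitions and, per the copy_annotations/copy_policy flags, the
+         # 'annotations', 'acls', and 'acl_bindings' sections in place.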
+
+         # Collect schemas that have included tables
+         included_schemas: set[str] = {schema for schema, _ in included_tables}
+
+         for sname in included_schemas:
+             if sname not in src_model.schemas:
+                 continue
+
+             schema = src_model.schemas[sname]
+             schema_def = prune_parts(schema.prejson(), 'tables')
+             new_model.append(schema_def)
+
+             for tname, table in schema.tables.items():
+                 if (sname, tname) not in included_tables:
+                     continue
+
+                 if table.kind != 'table':
+                     continue
+
+                 if 'RID' not in table.column_definitions.elements:
+                     logger.warning(f"Table {sname}.{tname} lacks system columns, skipping")
+                     report.tables_skipped.append(f"{sname}:{tname}")
+                     continue
+
+                 # Create the table definition without FKs
+                 table_def = prune_parts(table.prejson(), 'foreign_keys')
+                 table_def['column_definitions'] = [
+                     prune_parts(c) for c in table_def['column_definitions']
+                 ]
+                 table_def['keys'] = [prune_parts(k) for k in table_def.get('keys', [])]
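+                 # Clone-state annotation: 1 = table created, data still pending;
+                 # flipped to 2 below once the table's rows have been copied.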
+                 table_def.setdefault('annotations', {})[_clone_state_url] = 1
+
+                 new_model.append(table_def)
+                 clone_states[(sname, tname)] = 1
+
+                 # Collect FKs (only those between included tables)
+                 for fkdef in table.prejson().get('foreign_keys', []):
+                     include_fk = True
+                     for ref_col in fkdef.get('referenced_columns', []):
+                         ref_schema = ref_col.get('schema_name')
+                         ref_table = ref_col.get('table_name')
+                         if (ref_schema, ref_table) not in included_tables:
+                             include_fk = False
+                             break
+
+                     if include_fk:
+                         fkeys_deferred.append((sname, tname, prune_parts(fkdef.copy())))
+
+         # Stage 1: Create schema without FKs
+         logger.info("Stage 1: Creating schema without foreign keys...")
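+         # Posting the combined document creates every schema and (FK-less)
+         # table in a single ERMrest request.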
+         if new_model:
+             dst_catalog.post("/schema", json=new_model)
+
+         # Stage 2: Copy filtered data
+         logger.info("Stage 2: Copying filtered data...")
+         total_rows_copied = 0
+         total_rows_skipped = 0
+         all_skipped_rids: list[str] = []
+         all_truncated_values: list[TruncatedValue] = []
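+         # Copy rows in fixed-size pages to bound individual request sizes.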
+         page_size = 1000
+
+         if use_export_annotation and export_paths:
+             # Use path-based copying to respect row-level security
+             logger.info("Using path-based copying (respects row-level ACLs)...")
+             rows_by_table = _copy_data_via_export_paths(
+                 src_catalog=src_catalog,
+                 dst_catalog=dst_catalog,
+                 root_table=root_table_key,
+                 root_rid=root_rid,
+                 export_paths=export_paths,
+                 all_tables=all_tables,
+                 report=report,
+                 truncate_oversized=truncate_oversized,
+                 page_size=page_size,
+             )
+             for table_key, rows in rows_by_table.items():
+                 report.tables_restored[table_key] = rows
+                 total_rows_copied += rows
+
+             # Mark all tables complete
+             for (sname, tname), state in clone_states.items():
+                 if state == 1:
+                     try:
+                         dst_catalog.put(
+                             f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                             json=2
+                         )
+                     except Exception:
+                         pass
+         else:
+             # Use RID-based copying (original approach)
+             for (sname, tname), state in clone_states.items():
+                 if state != 1:
+                     continue
+
+                 table_key = f"{sname}:{tname}"
+                 table_reachable = reachable_rids.get(table_key, set())
+
+                 if not table_reachable:
+                     logger.debug(f"No reachable rows for {table_key}")
+                     report.tables_restored[table_key] = 0
+                     continue
+
+                 logger.debug(f"Copying {len(table_reachable)} rows for {table_key}")
+
+                 rows_copied, rows_skipped, skipped, truncated = _copy_subset_table_data(
+                     src_catalog=src_catalog,
+                     dst_catalog=dst_catalog,
+                     sname=sname,
+                     tname=tname,
+                     reachable_rids=table_reachable,
+                     page_size=page_size,
+                     report=report,
+                     truncate_oversized=truncate_oversized,
+                 )
+
+                 total_rows_copied += rows_copied
+                 total_rows_skipped += rows_skipped
+                 all_skipped_rids.extend(skipped)
+                 all_truncated_values.extend(truncated)
+
+                 report.tables_restored[table_key] = rows_copied
+
+                 # Mark complete
+                 try:
+                     dst_catalog.put(
+                         f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                         json=2
+                     )
+                 except Exception:
+                     pass
+
+         logger.info(f"Copied {total_rows_copied} rows, skipped {total_rows_skipped}")
+
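+         # FKs were deliberately omitted in Stage 1 so rows could be inserted in
+         # any order; applying them now also validates the referential integrity
+         # of the copied subset.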
+         # Stage 3: Apply FKs
+         logger.info(f"Stage 3: Applying {len(fkeys_deferred)} foreign keys...")
+         fkeys_applied = 0
+         fkeys_failed = 0
+
+         for sname, tname, fkdef in fkeys_deferred:
+             fk_name = fkdef.get('names', [[sname, 'unknown']])[0]
+             try:
+                 dst_catalog.post(
+                     f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/foreignkey",
+                     json=fkdef
+                 )
+                 fkeys_applied += 1
+                 report.fkeys_applied += 1
+             except Exception as e:
+                 error_str = str(e)
+                 if "violates foreign key constraint" in error_str and orphan_strategy == OrphanStrategy.FAIL:
+                     report.add_issue(CloneIssue(
+                         severity=CloneIssueSeverity.ERROR,
+                         category=CloneIssueCategory.FK_VIOLATION,
+                         message="FK constraint violation",
+                         table=f"{sname}:{tname}",
+                         details=f"FK {fk_name}: {error_str[:200]}",
+                         action="Some reachable rows may have dangling references",
+                     ))
+                 else:
+                     logger.warning(f"Failed to apply FK {fk_name}: {e}")
+                 fkeys_failed += 1
+                 report.fkeys_failed += 1
+
+         logger.info(f"Applied {fkeys_applied} FKs, failed {fkeys_failed}")
+
+         # Build result
+         result = CloneCatalogResult(
+             catalog_id=dst_catalog_id,
+             hostname=dest_hostname,
+             schema_only=False,
+             asset_mode=asset_mode,
+             source_hostname=source_hostname,
+             source_catalog_id=source_catalog_id,
+             source_snapshot=None,
+             alias=alias,
+             orphan_rows_removed=0,
+             orphan_rows_nullified=0,
+             fkeys_pruned=0,
+             rows_skipped=total_rows_skipped,
+             truncated_values=all_truncated_values,
+             report=report,
+         )
+
+         # Post-clone operations
+         if alias:
+             try:
+                 dst_server.create_ermrest_alias(id=alias, alias_target=str(dst_catalog_id))
+                 result.alias = alias
+             except Exception as e:
+                 logger.warning(f"Failed to create alias '{alias}': {e}")
+
+         if add_ml_schema:
+             try:
+                 create_ml_schema(dst_catalog)
+                 result.ml_schema_added = True
+
+                 # Apply catalog annotations (chaise-config, navbar, etc.)
+                 # Import DerivaML locally to avoid a circular import
+                 # (deriva_ml.__init__ imports from clone.py)
+                 try:
+                     from deriva_ml import DerivaML
+                     ml = DerivaML(dest_hostname, str(dst_catalog_id), check_auth=False)
+                     ml.apply_catalog_annotations()
+                     logger.info("Applied catalog annotations (chaise-config, navbar)")
+                 except Exception as e:
+                     logger.warning(f"Failed to apply catalog annotations: {e}")
+             except Exception as e:
+                 logger.warning(f"Failed to add ML schema: {e}")
+
+         if reinitialize_dataset_versions and "deriva-ml" in src_model.schemas:
+             result = _reinitialize_dataset_versions(result, dst_cred)
+
+         # Set defaultTable to the root table for partial clones.
+         # This ensures the Chaise UI has a valid landing page.
+         try:
+             chaise_config_url = "tag:isrd.isi.edu,2019:chaise-config"
+             dst_model = dst_catalog.getCatalogModel()
+             dst_model.annotations[chaise_config_url] = dst_model.annotations.get(chaise_config_url, {})
+             # Chaise expects defaultTable as an object with schema and table keys
+             root_schema, root_tname = root_table_key.split(":", 1)
+             dst_model.annotations[chaise_config_url]["defaultTable"] = {
+                 "schema": root_schema,
+                 "table": root_tname,
+             }
+             dst_model.apply()
+             logger.info(f"Set defaultTable to {root_table_key}")
+         except Exception as e:
+             logger.warning(f"Failed to set defaultTable annotation: {e}")
+
+         logger.info(
+             f"Subset clone complete: {dest_hostname}/{dst_catalog_id} "
+             f"({total_rows_copied} rows in {len(clone_states)} tables)"
+         )
+
+         return result
+
+     except Exception as e:
+         # Clean up on failure
+         logger.error(f"Clone failed: {e}")
+         try:
+             dst_server.delete_ermrest_catalog(dst_catalog_id)
+             logger.info(f"Cleaned up failed catalog {dst_catalog_id}")
+         except Exception:
+             pass
+         raise