deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +2 -2
- deriva_ml/asset/asset.py +0 -4
- deriva_ml/catalog/__init__.py +6 -0
- deriva_ml/catalog/clone.py +1591 -38
- deriva_ml/catalog/localize.py +66 -29
- deriva_ml/core/base.py +12 -9
- deriva_ml/core/definitions.py +13 -12
- deriva_ml/core/ermrest.py +11 -12
- deriva_ml/core/mixins/annotation.py +2 -2
- deriva_ml/core/mixins/asset.py +3 -3
- deriva_ml/core/mixins/dataset.py +3 -3
- deriva_ml/core/mixins/execution.py +1 -0
- deriva_ml/core/mixins/feature.py +2 -2
- deriva_ml/core/mixins/file.py +2 -2
- deriva_ml/core/mixins/path_builder.py +2 -2
- deriva_ml/core/mixins/rid_resolution.py +2 -2
- deriva_ml/core/mixins/vocabulary.py +2 -2
- deriva_ml/core/mixins/workflow.py +3 -3
- deriva_ml/dataset/catalog_graph.py +3 -4
- deriva_ml/dataset/dataset.py +5 -3
- deriva_ml/dataset/dataset_bag.py +0 -2
- deriva_ml/dataset/upload.py +2 -2
- deriva_ml/demo_catalog.py +0 -1
- deriva_ml/execution/__init__.py +8 -8
- deriva_ml/execution/base_config.py +2 -2
- deriva_ml/execution/execution.py +5 -3
- deriva_ml/execution/execution_record.py +0 -1
- deriva_ml/execution/model_protocol.py +1 -1
- deriva_ml/execution/multirun_config.py +0 -1
- deriva_ml/execution/runner.py +3 -3
- deriva_ml/experiment/experiment.py +3 -3
- deriva_ml/feature.py +2 -2
- deriva_ml/interfaces.py +2 -2
- deriva_ml/model/__init__.py +45 -24
- deriva_ml/model/annotations.py +0 -1
- deriva_ml/model/catalog.py +3 -2
- deriva_ml/model/data_loader.py +330 -0
- deriva_ml/model/data_sources.py +439 -0
- deriva_ml/model/database.py +216 -32
- deriva_ml/model/fk_orderer.py +379 -0
- deriva_ml/model/handles.py +1 -1
- deriva_ml/model/schema_builder.py +816 -0
- deriva_ml/run_model.py +3 -3
- deriva_ml/schema/annotations.py +2 -1
- deriva_ml/schema/create_schema.py +1 -1
- deriva_ml/schema/validation.py +1 -1
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
- deriva_ml-1.17.16.dist-info/RECORD +81 -0
- deriva_ml-1.17.14.dist-info/RECORD +0 -77
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0
deriva_ml/catalog/clone.py
CHANGED
@@ -20,7 +20,7 @@ from __future__ import annotations
 
 import json
 import logging
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime, timezone
 from enum import Enum
 from typing import Any
@@ -29,6 +29,9 @@ from urllib.parse import quote as urlquote
 from deriva.core import DerivaServer, ErmrestCatalog, get_credential
 from deriva.core.hatrac_store import HatracStore
 
+from deriva_ml.model.catalog import VOCAB_COLUMNS
+from deriva_ml.schema import create_ml_schema
+
 logger = logging.getLogger("deriva_ml")
 
 
@@ -87,9 +90,10 @@ class CloneIssue:
     details: str | None = None
     action: str | None = None
     row_count: int = 0
+    skipped_rids: list[str] | None = None  # RIDs of rows that were skipped
 
     def to_dict(self) -> dict[str, Any]:
-        return {
+        result = {
             "severity": self.severity.value,
             "category": self.category.value,
             "message": self.message,
@@ -98,6 +102,9 @@ class CloneIssue:
             "action": self.action,
             "row_count": self.row_count,
         }
+        if self.skipped_rids:
+            result["skipped_rids"] = self.skipped_rids
+        return result
 
     def __str__(self) -> str:
         parts = [f"[{self.severity.value.upper()}]"]
@@ -106,7 +113,32 @@ class CloneIssue:
         parts.append(self.message)
         if self.row_count > 0:
             parts.append(f"({self.row_count} rows)")
-        return " ".join(parts)
+        result = " ".join(parts)
+        if self.skipped_rids:
+            # For small numbers, list the RIDs; for large numbers, just show count
+            if len(self.skipped_rids) <= 5:
+                result += f"\n Skipped RIDs: {', '.join(self.skipped_rids)}"
+            else:
+                result += f"\n Skipped RIDs: {len(self.skipped_rids)} rows (see JSON for full list)"
+        return result
+
+
+@dataclass
+class CloneReportSummary:
+    """Summary statistics for a clone operation."""
+
+    total_issues: int
+    errors: int
+    warnings: int
+    tables_restored: int
+    tables_failed: int
+    tables_skipped: int
+    total_rows_restored: int
+    orphan_rows_removed: int
+    orphan_rows_nullified: int
+    fkeys_applied: int
+    fkeys_failed: int
+    fkeys_pruned: int
 
 
 @dataclass
@@ -134,27 +166,32 @@ class CloneReport:
     def add_issue(self, issue: CloneIssue) -> None:
         self.issues.append(issue)
 
+    @property
+    def summary(self) -> CloneReportSummary:
+        """Return summary statistics as a dataclass."""
+        return CloneReportSummary(
+            total_issues=len(self.issues),
+            errors=len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
+            warnings=len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
+            tables_restored=len(self.tables_restored),
+            tables_failed=len(self.tables_failed),
+            tables_skipped=len(self.tables_skipped),
+            total_rows_restored=sum(self.tables_restored.values()),
+            orphan_rows_removed=sum(
+                d.get("rows_removed", 0) for d in self.orphan_details.values()
+            ),
+            orphan_rows_nullified=sum(
+                d.get("rows_nullified", 0) for d in self.orphan_details.values()
+            ),
+            fkeys_applied=self.fkeys_applied,
+            fkeys_failed=self.fkeys_failed,
+            fkeys_pruned=self.fkeys_pruned,
+        )
+
     def to_dict(self) -> dict[str, Any]:
         """Return the report as a JSON-serializable dictionary."""
         return {
-            "summary": {
-                "total_issues": len(self.issues),
-                "errors": len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
-                "warnings": len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
-                "tables_restored": len(self.tables_restored),
-                "tables_failed": len(self.tables_failed),
-                "tables_skipped": len(self.tables_skipped),
-                "total_rows_restored": sum(self.tables_restored.values()),
-                "orphan_rows_removed": sum(
-                    d.get("rows_removed", 0) for d in self.orphan_details.values()
-                ),
-                "orphan_rows_nullified": sum(
-                    d.get("rows_nullified", 0) for d in self.orphan_details.values()
-                ),
-                "fkeys_applied": self.fkeys_applied,
-                "fkeys_failed": self.fkeys_failed,
-                "fkeys_pruned": self.fkeys_pruned,
-            },
+            "summary": asdict(self.summary),
             "issues": [i.to_dict() for i in self.issues],
             "tables_restored": self.tables_restored,
             "tables_failed": self.tables_failed,
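For orientation, a minimal sketch (not part of the package) of how the reworked report serialization above might be consumed; `report` is assumed to be a populated `CloneReport` from a clone run, and the attribute names come from the diff itself:

```python
import json

# `report` is a hypothetical CloneReport instance produced by a clone run.
summary = report.summary  # the new CloneReportSummary dataclass
print(f"{summary.errors} errors, {summary.warnings} warnings, "
      f"{summary.total_rows_restored} rows restored")

# to_dict() now nests asdict(self.summary) under the "summary" key, and each
# issue dict carries "skipped_rids" only when rows were actually skipped.
with open("clone_report.json", "w") as f:
    json.dump(report.to_dict(), f, indent=2)
```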
@@ -332,6 +369,7 @@ class CloneDetails:
     source_catalog_id: str
     source_snapshot: str | None = None
     source_schema_url: str | None = None  # Hatrac URL to source schema JSON
+    # Clone parameters
     orphan_strategy: str = "fail"
     truncate_oversized: bool = False
     prune_hidden_fkeys: bool = False
@@ -339,15 +377,21 @@ class CloneDetails:
     asset_mode: str = "refs"
     exclude_schemas: list[str] = field(default_factory=list)
     exclude_objects: list[str] = field(default_factory=list)
+    add_ml_schema: bool = False
+    copy_annotations: bool = True
+    copy_policy: bool = True
+    reinitialize_dataset_versions: bool = True
+    # Statistics
     rows_copied: int = 0
     rows_skipped: int = 0
+    skipped_rids: list[str] = field(default_factory=list)  # RIDs of skipped rows
     truncated_count: int = 0
     orphan_rows_removed: int = 0
     orphan_rows_nullified: int = 0
     fkeys_pruned: int = 0
 
     def to_dict(self) -> dict[str, Any]:
-        return {
+        result = {
             "source_hostname": self.source_hostname,
             "source_catalog_id": self.source_catalog_id,
             "source_snapshot": self.source_snapshot,
@@ -359,6 +403,10 @@ class CloneDetails:
             "asset_mode": self.asset_mode,
             "exclude_schemas": self.exclude_schemas,
             "exclude_objects": self.exclude_objects,
+            "add_ml_schema": self.add_ml_schema,
+            "copy_annotations": self.copy_annotations,
+            "copy_policy": self.copy_policy,
+            "reinitialize_dataset_versions": self.reinitialize_dataset_versions,
             "rows_copied": self.rows_copied,
             "rows_skipped": self.rows_skipped,
             "truncated_count": self.truncated_count,
@@ -366,6 +414,9 @@ class CloneDetails:
             "orphan_rows_nullified": self.orphan_rows_nullified,
             "fkeys_pruned": self.fkeys_pruned,
         }
+        if self.skipped_rids:
+            result["skipped_rids"] = self.skipped_rids
+        return result
 
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> "CloneDetails":
@@ -381,8 +432,13 @@ class CloneDetails:
             asset_mode=data.get("asset_mode", "refs"),
             exclude_schemas=data.get("exclude_schemas", []),
             exclude_objects=data.get("exclude_objects", []),
+            add_ml_schema=data.get("add_ml_schema", False),
+            copy_annotations=data.get("copy_annotations", True),
+            copy_policy=data.get("copy_policy", True),
+            reinitialize_dataset_versions=data.get("reinitialize_dataset_versions", True),
             rows_copied=data.get("rows_copied", 0),
             rows_skipped=data.get("rows_skipped", 0),
+            skipped_rids=data.get("skipped_rids", []),
             truncated_count=data.get("truncated_count", 0),
             orphan_rows_removed=data.get("orphan_rows_removed", 0),
             orphan_rows_nullified=data.get("orphan_rows_nullified", 0),
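A small round-trip sketch of the extended `CloneDetails` record; the constructor arguments are illustrative placeholders (and assume `source_hostname` and `source_catalog_id` are the only required fields), while the field names and defaults come from the diff above:

```python
# Illustrative values only; "example.org" and catalog id "1" are placeholders.
details = CloneDetails(
    source_hostname="example.org",
    source_catalog_id="1",
    add_ml_schema=True,
    skipped_rids=["1-ABC"],
)

payload = details.to_dict()          # "skipped_rids" appears only when non-empty
restored = CloneDetails.from_dict(payload)
assert restored.add_ml_schema is True
assert restored.skipped_rids == ["1-ABC"]
```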
@@ -677,7 +733,7 @@ def _copy_table_data_with_retry(
     report: "CloneReport",
     deferred_indexes: dict[str, list[dict]],
     truncate_oversized: bool = False,
-) -> tuple[int, int, list[TruncatedValue]]:
+) -> tuple[int, int, list[str], list[TruncatedValue]]:
     """Copy data for a single table with retry logic for index errors.
 
     If a btree index size error occurs, this function will:
@@ -698,7 +754,7 @@ def _copy_table_data_with_retry(
         truncate_oversized: If True, truncate oversized values instead of skipping rows.
 
     Returns:
-        Tuple of (rows_copied, rows_skipped, truncated_values).
+        Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
         rows_copied is -1 if the copy failed entirely.
     """
     tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
@@ -711,6 +767,7 @@ def _copy_table_data_with_retry(
    last = None
    table_rows = 0
    rows_skipped = 0
+    skipped_rids: list[str] = []  # Track RIDs of skipped rows
    truncated_values: list[TruncatedValue] = []
    row_by_row_mode = False
    problematic_index = None
@@ -768,7 +825,7 @@ def _copy_table_data_with_retry(
            ).json()
        except Exception as e:
            logger.warning(f"Failed to read from {sname}:{tname}: {e}")
-            return -1, rows_skipped, truncated_values
+            return -1, rows_skipped, skipped_rids, truncated_values
 
        if not page:
            break
@@ -809,11 +866,14 @@
 
                        rows_skipped += 1
                        rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
                        logger.debug(f"Skipping row {rid} in {table_key} due to index size limit")
                    else:
                        # Different error - log and skip
                        rows_skipped += 1
-
+                        rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
+                        logger.debug(f"Skipping row {rid} in {table_key}: {row_error}")
                last = page[-1]['RID']
            else:
                # Normal batch mode
@@ -884,14 +944,17 @@
 
                        rows_skipped += 1
                        rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
                        logger.debug(f"Skipping row {rid} due to index size limit")
                    else:
                        rows_skipped += 1
-
+                        rid = row.get('RID', 'unknown')
+                        skipped_rids.append(rid)
+                        logger.debug(f"Skipping row {rid}: {row_error}")
                last = page[-1]['RID']
            else:
                logger.warning(f"Failed to write to {sname}:{tname}: {e}")
-                return -1, rows_skipped, truncated_values
+                return -1, rows_skipped, skipped_rids, truncated_values
 
    # Report skipped rows
    if rows_skipped > 0:
@@ -903,8 +966,9 @@
            details=f"Index '{problematic_index}' on column '{problematic_column}'",
            action="These rows have values too large for btree index (>2704 bytes)",
            row_count=rows_skipped,
+            skipped_rids=skipped_rids if skipped_rids else None,
        ))
-        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits")
+        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits: RIDs={skipped_rids}")
 
    # Report truncated values
    if truncated_values:
@@ -919,7 +983,7 @@
        ))
        logger.info(f"Truncated {len(truncated_values)} values in {table_key}")
 
-    return table_rows, rows_skipped, truncated_values
+    return table_rows, rows_skipped, skipped_rids, truncated_values
 
 
 
@@ -946,6 +1010,925 @@ def _rebuild_deferred_indexes(
     logger.info(f"Reporting {sum(len(v) for v in deferred_indexes.values())} index issues...")
 
 
+# =============================================================================
+# Subset Clone Helpers
+# =============================================================================
+
+
+# Export annotation tag
+_export_tag = "tag:isrd.isi.edu,2019:export"
+
+
+def _parse_export_annotation_tables(
+    table: Any,
+    paths_discovered: list[list[str]] | None = None,
+) -> tuple[list[str], list[list[str]]]:
+    """Parse export annotation from a table to extract tables and paths.
+
+    The export annotation (tag:isrd.isi.edu,2019:export) defines which tables
+    should be exported when downloading a row as a BDBag. This function extracts
+    the table names from the annotation paths.
+
+    Args:
+        table: ERMrest Table object with annotations.
+        paths_discovered: Optional list to append discovered paths to (for reuse).
+
+    Returns:
+        Tuple of (tables_list, paths_list) where:
+        - tables_list: List of table names in "schema:table" format
+        - paths_list: List of paths, each path is a list of "schema:table" strings
+    """
+    if paths_discovered is None:
+        paths_discovered = []
+
+    tables: set[str] = set()
+
+    # Add the root table itself
+    root_table_spec = f"{table.schema.name}:{table.name}"
+    tables.add(root_table_spec)
+
+    # Get the export annotation
+    export_annotation = table.annotations.get(_export_tag, {})
+
+    # Export annotations can have multiple contexts (*, detailed, etc.)
+    # We'll look at all of them
+    for context_key, context_value in export_annotation.items():
+        templates = context_value.get("templates", [])
+        for template in templates:
+            outputs = template.get("outputs", [])
+            for output in outputs:
+                source = output.get("source", {})
+                path_str = source.get("path", "")
+
+                if not path_str:
+                    continue
+
+                # Parse the path - it's in ERMrest format like "schema:table/schema:table2/..."
+                # Split by "/" and parse each segment
+                path_segments = path_str.split("/")
+                current_path: list[str] = [root_table_spec]
+
+                for segment in path_segments:
+                    # Skip empty segments
+                    if not segment:
+                        continue
+
+                    # Skip attribute projections (contain ":" followed by "=")
+                    if "=" in segment:
+                        continue
+
+                    # Parse schema:table format
+                    if ":" in segment:
+                        # Could be "schema:table" or complex path syntax
+                        # For simple schema:table, just add it
+                        parts = segment.split(":")
+                        if len(parts) == 2 and not any(c in segment for c in ["(", ")", "!", "@"]):
+                            schema, tname = parts
+                            table_spec = f"{schema}:{tname}"
+                            tables.add(table_spec)
+                            current_path.append(table_spec)
+
+                if len(current_path) > 1:
+                    paths_discovered.append(current_path)
+
+    return sorted(tables), paths_discovered
+
+
+def _compute_reachable_rids_from_paths(
+    catalog: ErmrestCatalog,
+    root_rid: str,
+    root_table: str,
+    paths: list[list[str]],
+    include_tables: list[str],
+    model: Any | None = None,
+) -> dict[str, set[str]]:
+    """Compute RIDs reachable from root_rid using predefined paths.
+
+    This is more efficient than FK graph traversal because it uses the paths
+    defined in the export annotation, which are already known to work.
+
+    After following the paths, also discovers FK references from reachable rows
+    back to tables in the include list. This ensures FK integrity by including
+    referenced rows that weren't found via the export paths.
+
+    Args:
+        catalog: Source catalog connection.
+        root_rid: Starting RID.
+        root_table: Root table in "schema:table" format.
+        paths: List of paths from export annotation, each path is a list of
+            "schema:table" strings starting with the root table.
+        include_tables: All tables to track reachability for.
+        model: Optional ERMrest Model for FK relationship discovery.
+
+    Returns:
+        Dict mapping "schema:table" -> set of reachable RIDs.
+    """
+    # Initialize reachable sets for all tables
+    reachable: dict[str, set[str]] = {t: set() for t in include_tables}
+    reachable[root_table].add(root_rid)
+
+    # Query each path from the export annotation
+    for path in paths:
+        if len(path) < 2:
+            continue
+
+        # Build ERMrest query following the path
+        # Start with the root table and RID filter
+        query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+
+        # Add each step in the path (skip the root table)
+        for table_spec in path[1:]:
+            query += f"/{_quote_table_spec(table_spec)}"
+
+        # Query for rows at the end of the path
+        target_table = path[-1]
+        if target_table not in reachable:
+            continue
+
+        try:
+            result = catalog.get(query).json()
+            for row in result:
+                if "RID" in row:
+                    reachable[target_table].add(row["RID"])
+            if result:
+                logger.debug(f"Path {' -> '.join(path)}: found {len(result)} rows")
+        except Exception as e:
+            logger.debug(f"Path query failed: {query}: {e}")
+            continue
+
+    # Note: FK reference expansion was too slow for large datasets and is disabled.
+    # Instead, rely on orphan_strategy (DELETE/NULLIFY) to handle any FK violations
+    # that occur when referenced rows weren't found via the export paths.
+
+    return reachable
+
+
+def _expand_reachable_via_fk_references(
+    catalog: ErmrestCatalog,
+    reachable: dict[str, set[str]],
+    include_tables: list[str],
+    model: Any,
+) -> None:
+    """Expand reachable RIDs by following FK references.
+
+    For each table with reachable rows, find FK columns that reference other
+    included tables and add the referenced RIDs to the reachable set.
+
+    Args:
+        catalog: Source catalog connection.
+        reachable: Dict mapping "schema:table" -> set of RIDs (modified in place).
+        include_tables: Tables to include.
+        model: ERMrest Model object.
+    """
+    # Build table lookup
+    table_lookup: dict[tuple[str, str], str] = {}
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        table_lookup[(schema, table_name)] = table_spec
+
+    # Iterate until no new RIDs are discovered
+    max_iterations = 10  # Prevent infinite loops
+    iteration = 0
+
+    while iteration < max_iterations:
+        iteration += 1
+        new_rids_found = False
+
+        for table_spec in include_tables:
+            current_rids = reachable.get(table_spec, set())
+            if not current_rids:
+                continue
+
+            schema, table_name = table_spec.split(":", 1)
+            try:
+                table = model.schemas[schema].tables[table_name]
+            except KeyError:
+                continue
+
+            # Check each FK for references to other included tables
+            for fk in table.foreign_keys:
+                pk_table = fk.pk_table
+                pk_key = (pk_table.schema.name, pk_table.name)
+                pk_spec = table_lookup.get(pk_key)
+
+                if not pk_spec:
+                    continue  # Target table not in our include list
+
+                # Get the FK column name
+                if not fk.foreign_key_columns:
+                    continue
+                fk_col = fk.foreign_key_columns[0].name
+
+                # Query for FK values from reachable rows
+                # Do this in batches to avoid URL length limits
+                # Ensure all RIDs are strings
+                rids_list = [str(r) for r in current_rids if r is not None]
+                batch_size = 100
+
+                for i in range(0, len(rids_list), batch_size):
+                    batch = rids_list[i:i + batch_size]
+                    rid_filter = ",".join(urlquote(r) for r in batch)
+
+                    try:
+                        # Get distinct FK values
+                        query = f"/attributegroup/{_quote_table_spec(table_spec)}/RID=any({rid_filter})/{urlquote(fk_col)}"
+                        result = catalog.get(query).json()
+
+                        for row in result:
+                            fk_value = row.get(fk_col)
+                            if fk_value is not None:
+                                # Ensure FK value is a string
+                                fk_value_str = str(fk_value)
+                                if fk_value_str not in reachable[pk_spec]:
+                                    reachable[pk_spec].add(fk_value_str)
+                                    new_rids_found = True
+                    except Exception as e:
+                        logger.debug(f"FK reference query failed: {e}")
+                        continue
+
+        if not new_rids_found:
+            break
+
+    if iteration > 1:
+        logger.debug(f"FK reference expansion completed in {iteration} iterations")
+
+
+def _expand_tables_with_associations(
+    model: Any,
+    include_tables: list[str],
+) -> tuple[list[str], list[str]]:
+    """Expand table list to include association tables needed for FK integrity.
+
+    Given a list of tables, finds all association tables that connect pairs
+    of included tables and adds them to the list.
+
+    Args:
+        model: ERMrest Model object.
+        include_tables: List of table names in "schema:table" format.
+
+    Returns:
+        Tuple of (all_tables, association_tables_added) where:
+        - all_tables: Original tables plus added association tables
+        - association_tables_added: Just the association tables that were added
+    """
+    # Parse table names to (schema, table) tuples
+    included_set: set[tuple[str, str]] = set()
+    for table_spec in include_tables:
+        if ":" in table_spec:
+            schema, table = table_spec.split(":", 1)
+            included_set.add((schema, table))
+        else:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+
+    # Find association tables connecting included tables
+    associations_added: list[str] = []
+
+    for schema_name, table_name in list(included_set):
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Check for associations from this table
+        for assoc in table.find_associations(pure=False):
+            assoc_table = assoc.table
+            assoc_key = (assoc_table.schema.name, assoc_table.name)
+
+            # Already included
+            if assoc_key in included_set:
+                continue
+
+            # Check if the other end of the association is in our included set
+            for other_fk in assoc.other_fkeys:
+                other_table = other_fk.pk_table
+                other_key = (other_table.schema.name, other_table.name)
+
+                if other_key in included_set:
+                    # This association connects two included tables
+                    included_set.add(assoc_key)
+                    assoc_spec = f"{assoc_key[0]}:{assoc_key[1]}"
+                    if assoc_spec not in associations_added:
+                        associations_added.append(assoc_spec)
+                    break
+
+    all_tables = list(include_tables) + associations_added
+    return all_tables, associations_added
+
+
+def _expand_tables_with_vocabularies(
+    model: Any,
+    include_tables: list[str],
+) -> tuple[list[str], list[str]]:
+    """Expand table list to include vocabulary tables referenced by included tables.
+
+    Examines FK targets of included tables and adds any that are vocabulary tables.
+
+    Args:
+        model: ERMrest Model object.
+        include_tables: List of table names in "schema:table" format.
+
+    Returns:
+        Tuple of (all_tables, vocabulary_tables_added) where:
+        - all_tables: Original tables plus added vocabulary tables
+        - vocabulary_tables_added: Just the vocabulary tables that were added
+    """
+    def is_vocabulary(table) -> bool:
+        return VOCAB_COLUMNS.issubset({c.name.upper() for c in table.columns})
+
+    # Parse table names
+    included_set: set[tuple[str, str]] = set()
+    for table_spec in include_tables:
+        if ":" in table_spec:
+            schema, table = table_spec.split(":", 1)
+            included_set.add((schema, table))
+
+    vocabularies_added: list[str] = []
+
+    for schema_name, table_name in list(included_set):
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Check FK targets for vocabulary tables
+        for fk in table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+
+            if pk_key in included_set:
+                continue
+
+            if is_vocabulary(pk_table):
+                included_set.add(pk_key)
+                vocab_spec = f"{pk_key[0]}:{pk_key[1]}"
+                if vocab_spec not in vocabularies_added:
+                    vocabularies_added.append(vocab_spec)
+
+    all_tables = list(include_tables) + vocabularies_added
+    return all_tables, vocabularies_added
+
+
+def _quote_table_spec(table_spec: str) -> str:
+    """URL-quote a table specification for ERMrest queries.
+
+    ERMrest uses schema:table format where the colon must NOT be encoded.
+    This function quotes the schema and table names separately.
+
+    Args:
+        table_spec: Table specification in "schema:table" format.
+
+    Returns:
+        URL-safe string with schema and table quoted but colon preserved.
+    """
+    schema, table = table_spec.split(":", 1)
+    return f"{urlquote(schema)}:{urlquote(table)}"
+
+
+def _discover_reachable_tables(
+    model: Any,
+    start_tables: list[str],
+    exclude_tables: set[tuple[str, str]] | None = None,
+    exclude_schemas: set[str] | None = None,
+) -> list[str]:
+    """Discover all tables reachable from start tables via FK relationships.
+
+    Traverses FK graph in both directions (outbound and inbound FKs) to find
+    all connected tables, excluding system schemas and specified exclusions.
+
+    Args:
+        model: ERMrest Model object.
+        start_tables: Starting tables in "schema:table" format.
+        exclude_tables: Set of (schema, table) tuples to exclude from discovery.
+        exclude_schemas: Set of schema names to exclude entirely.
+
+    Returns:
+        List of reachable table names in "schema:table" format.
+    """
+    exclude_tables = exclude_tables or set()
+    exclude_schemas = exclude_schemas or set()
+
+    # System schemas to always exclude
+    system_schemas = {"public", "_acl_admin", "WWW"}
+    all_excluded_schemas = system_schemas | exclude_schemas
+
+    # Parse start tables
+    discovered: set[tuple[str, str]] = set()
+    to_visit: list[tuple[str, str]] = []
+
+    for table_spec in start_tables:
+        if ":" not in table_spec:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+        schema, table = table_spec.split(":", 1)
+        key = (schema, table)
+        if key not in exclude_tables and schema not in all_excluded_schemas:
+            discovered.add(key)
+            to_visit.append(key)
+
+    # BFS traversal of FK graph
+    while to_visit:
+        current_key = to_visit.pop(0)
+        schema_name, table_name = current_key
+
+        try:
+            table = model.schemas[schema_name].tables[table_name]
+        except KeyError:
+            continue
+
+        # Find connected tables via outbound FKs (this table references other tables)
+        for fk in table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+
+            if pk_key in discovered or pk_key in exclude_tables:
+                continue
+            if pk_table.schema.name in all_excluded_schemas:
+                continue
+
+            discovered.add(pk_key)
+            to_visit.append(pk_key)
+
+        # Find connected tables via inbound FKs (other tables reference this table)
+        for fk in table.referenced_by:
+            ref_table = fk.table
+            ref_key = (ref_table.schema.name, ref_table.name)
+
+            if ref_key in discovered or ref_key in exclude_tables:
+                continue
+            if ref_table.schema.name in all_excluded_schemas:
+                continue
+
+            discovered.add(ref_key)
+            to_visit.append(ref_key)
+
+    # Convert to schema:table format
+    return [f"{schema}:{table}" for schema, table in sorted(discovered)]
+
+
+def _build_path_query(
+    root_table: str,
+    root_rid: str,
+    path: list[tuple[str, str]],
+) -> str:
+    """Build an ERMrest path query to traverse FK relationships.
+
+    Args:
+        root_table: Starting table in "schema:table" format.
+        root_rid: RID of the starting row.
+        path: List of (schema, table) tuples representing the path.
+
+    Returns:
+        ERMrest query string like "/entity/Schema:Table/RID=X/Schema:Next/..."
+    """
+    query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+    for schema, table in path:
+        query += f"/{urlquote(schema)}:{urlquote(table)}"
+    return query
+
+
+def _compute_reachable_rids(
+    catalog: ErmrestCatalog,
+    root_rid: str,
+    include_tables: list[str],
+    model: Any,
+) -> dict[str, set[str]]:
+    """Compute RIDs reachable from root_rid for each included table.
+
+    Uses FK graph traversal (both directions) to find all rows that are
+    connected to the root row through FK relationships.
+
+    Args:
+        catalog: Source catalog connection.
+        root_rid: Starting RID.
+        include_tables: Tables to compute reachability for ("schema:table" format).
+        model: ERMrest Model object.
+
+    Returns:
+        Dict mapping "schema:table" -> set of reachable RIDs.
+    """
+    # First, resolve the root RID to find its table
+    root_table_key = None
+    root_table = None
+
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        try:
+            uri = f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(root_rid)}"
+            result = catalog.get(uri).json()
+            if result:
+                root_table_key = table_spec
+                root_table = model.schemas[schema].tables[table_name]
+                break
+        except Exception:
+            continue
+
+    if root_table_key is None:
+        raise ValueError(f"Root RID {root_rid} not found in any of the included tables")
+
+    # Initialize reachable sets
+    reachable: dict[str, set[str]] = {t: set() for t in include_tables}
+    reachable[root_table_key].add(root_rid)
+
+    # Parse include_tables to lookup dict
+    table_lookup: dict[tuple[str, str], str] = {}
+    for table_spec in include_tables:
+        schema, table_name = table_spec.split(":", 1)
+        table_lookup[(schema, table_name)] = table_spec
+
+    # Build paths from root table using FK traversal (both directions)
+    def find_paths(
+        start_table: Any,
+        visited: set[tuple[str, str]],
+        current_path: list[tuple[str, str]],
+    ) -> list[list[tuple[str, str]]]:
+        """Recursively find all FK paths from start_table to included tables."""
+        paths = []
+
+        # Get all connected tables (both FK directions)
+        connected = []
+
+        # Outbound FKs (this table references other tables)
+        for fk in start_table.foreign_keys:
+            pk_table = fk.pk_table
+            pk_key = (pk_table.schema.name, pk_table.name)
+            if pk_key not in visited and pk_key in table_lookup:
+                connected.append(pk_table)
+
+        # Inbound FKs (other tables reference this table)
+        for fk in start_table.referenced_by:
+            ref_table = fk.table
+            ref_key = (ref_table.schema.name, ref_table.name)
+            if ref_key not in visited and ref_key in table_lookup:
+                connected.append(ref_table)
+
+        for next_table in connected:
+            next_key = (next_table.schema.name, next_table.name)
+            new_path = current_path + [next_key]
+
+            # This path reaches the target table
+            paths.append(new_path)
+
+            # Continue exploring from this table
+            new_visited = visited | {next_key}
+            paths.extend(find_paths(next_table, new_visited, new_path))
+
+        return paths
+
+    # Find all paths from root table
+    root_key = (root_table.schema.name, root_table.name)
+    all_paths = find_paths(root_table, {root_key}, [])
+
+    # For each path, query for reachable rows
+    for path in all_paths:
+        if not path:
+            continue
+
+        target_key = path[-1]
+        target_spec = table_lookup.get(target_key)
+        if not target_spec:
+            continue
+
+        # Build and execute the path query
+        query = _build_path_query(root_table_key, root_rid, path)
+        try:
+            result = catalog.get(query).json()
+            for row in result:
+                if "RID" in row:
+                    reachable[target_spec].add(row["RID"])
+        except Exception as e:
+            logger.debug(f"Path query failed: {query}: {e}")
+            continue
+
+    # Also need to check if reachable rows reference other reachable rows
+    # This handles transitive reachability through association tables
+    changed = True
+    while changed:
+        changed = False
+        for table_spec in include_tables:
+            schema, table_name = table_spec.split(":", 1)
+            try:
+                table = model.schemas[schema].tables[table_name]
+            except KeyError:
+                continue
+
+            current_rids = reachable[table_spec].copy()
+
+            # Check FKs from this table
+            for fk in table.foreign_keys:
+                pk_table = fk.pk_table
+                pk_spec = f"{pk_table.schema.name}:{pk_table.name}"
+                if pk_spec not in include_tables:
+                    continue
+
+                fk_col = fk.foreign_key_columns[0].name
+
+                # For each reachable row in this table, find the referenced row
+                for rid in current_rids:
+                    try:
+                        row = catalog.get(f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(rid)}").json()
+                        if row and fk_col in row[0] and row[0][fk_col]:
+                            ref_rid = row[0][fk_col]
+                            if ref_rid not in reachable[pk_spec]:
+                                reachable[pk_spec].add(ref_rid)
+                                changed = True
+                    except Exception:
+                        continue
+
+            # Check FKs to this table (inbound)
+            for fk in table.referenced_by:
+                ref_table = fk.table
+                ref_spec = f"{ref_table.schema.name}:{ref_table.name}"
+                if ref_spec not in include_tables:
+                    continue
+
+                fk_col = fk.foreign_key_columns[0].name
+
+                # For each reachable row in this table, find rows that reference it
+                for rid in current_rids:
+                    try:
+                        result = catalog.get(
+                            f"/entity/{_quote_table_spec(ref_spec)}/{urlquote(fk_col)}={urlquote(rid)}"
+                        ).json()
+                        for row in result:
+                            if "RID" in row and row["RID"] not in reachable[ref_spec]:
+                                reachable[ref_spec].add(row["RID"])
+                                changed = True
+                    except Exception:
+                        continue
+
+    return reachable
+
+
+def _copy_data_via_export_paths(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    root_table: str,
+    root_rid: str,
+    export_paths: list[list[str]],
+    all_tables: list[str],
+    report: "CloneReport",
+    truncate_oversized: bool = False,
+    page_size: int = 1000,
+) -> dict[str, int]:
+    """Copy data using export paths to respect row-level security.
+
+    Instead of computing reachable RIDs and fetching them individually (which can
+    fail due to row-level ACLs), this function copies data by following the export
+    paths directly. This ensures we only copy rows that are actually visible.
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        root_table: Root table in "schema:table" format.
+        root_rid: Root RID to start from.
+        export_paths: Paths from export annotation.
+        all_tables: All tables to copy (for vocabulary tables not in paths).
+        report: Clone report for recording issues.
+        truncate_oversized: Whether to truncate oversized values.
+        page_size: Number of rows per batch.
+
+    Returns:
+        Dict mapping table spec -> rows copied.
+    """
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+    rows_by_table: dict[str, int] = {}
+
+    def truncate_row(row: dict) -> tuple[dict, list[TruncatedValue]]:
+        """Truncate oversized values in a row."""
+        truncated_values = []
+        modified = row.copy()
+        for col, value in row.items():
+            if isinstance(value, str):
+                value_bytes = len(value.encode('utf-8'))
+                if value_bytes > MAX_INDEX_VALUE_BYTES:
+                    max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                    truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                        max_chars -= 100
+                        truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    modified[col] = truncated
+                    truncated_values.append(TruncatedValue(
+                        table="",
+                        rid=str(row.get('RID', 'unknown')),
+                        column=col,
+                        original_bytes=value_bytes,
+                        truncated_bytes=len(truncated.encode('utf-8')),
+                    ))
+        return modified, truncated_values
+
+    def copy_with_path(path_query: str, table_spec: str) -> int:
+        """Copy data using a path query, returning rows copied."""
+        sname, tname = table_spec.split(":", 1)
+        tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+        rows_copied = 0
+        last_rid = ""
+
+        while True:
+            # Add pagination
+            if last_rid:
+                query = f"{path_query}@sort(RID)@after({urlquote(last_rid)})?limit={page_size}"
+            else:
+                query = f"{path_query}@sort(RID)?limit={page_size}"
+
+            try:
+                page = src_catalog.get(query).json()
+            except Exception as e:
+                logger.warning(f"Path query failed {path_query}: {e}")
+                break
+
+            if not page:
+                break
+
+            # Process rows
+            rows_to_insert = []
+            for row in page:
+                if truncate_oversized:
+                    modified, _ = truncate_row(row)
+                    rows_to_insert.append(modified)
+                else:
+                    rows_to_insert.append(row)
+
+            # Insert
+            try:
+                dst_catalog.post(f"/entity/{tname_uri}", json=rows_to_insert)
+                rows_copied += len(rows_to_insert)
+            except Exception as e:
+                # Try row-by-row on failure
+                for row in rows_to_insert:
+                    try:
+                        dst_catalog.post(f"/entity/{tname_uri}", json=[row])
+                        rows_copied += 1
+                    except Exception:
+                        logger.debug(f"Failed to insert row: {e}")
+
+            if len(page) < page_size:
+                break
+            last_rid = page[-1].get("RID", "")
+            if not last_rid:
+                break
+
+        return rows_copied
+
+    # Copy root table (just the root row)
+    root_sname, root_tname = root_table.split(":", 1)
+    root_uri = f"{urlquote(root_sname)}:{urlquote(root_tname)}"
+    try:
+        root_row = src_catalog.get(f"/entity/{root_uri}/RID={urlquote(root_rid)}").json()
+        if root_row:
+            dst_catalog.post(f"/entity/{root_uri}", json=root_row)
+            rows_by_table[root_table] = 1
+            logger.info(f"Copied 1 row for {root_table}")
+    except Exception as e:
+        logger.warning(f"Failed to copy root row: {e}")
+
+    # Copy data for each export path
+    tables_copied = {root_table}
+    for path in export_paths:
+        if len(path) < 2:
+            continue
+
+        # Build the path query starting from root
+        query = f"/entity/{_quote_table_spec(root_table)}/RID={urlquote(root_rid)}"
+        for table_spec in path[1:]:
+            query += f"/{_quote_table_spec(table_spec)}"
+
+        target_table = path[-1]
+        if target_table in tables_copied:
+            continue
+
+        rows = copy_with_path(query, target_table)
+        rows_by_table[target_table] = rows_by_table.get(target_table, 0) + rows
+        tables_copied.add(target_table)
+        logger.info(f"Copied {rows} rows for {target_table}")
+
+    # Copy vocabulary tables (full copy since they're not in paths)
+    for table_spec in all_tables:
+        if table_spec in tables_copied:
+            continue
+
+        # Check if it's a vocabulary table
+        sname, tname = table_spec.split(":", 1)
+        if sname.startswith("vocab") or "vocab" in sname.lower():
+            # Full copy of vocabulary table
+            query = f"/entity/{_quote_table_spec(table_spec)}"
+            rows = copy_with_path(query, table_spec)
+            rows_by_table[table_spec] = rows
+            tables_copied.add(table_spec)
+            logger.info(f"Copied {rows} rows for vocabulary table {table_spec}")
+
+    return rows_by_table
+
+
+def _copy_subset_table_data(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    reachable_rids: set[str],
+    page_size: int,
+    report: "CloneReport",
+    truncate_oversized: bool = False,
+) -> tuple[int, int, list[str], list[TruncatedValue]]:
+    """Copy only rows with RIDs in reachable_rids from source to destination.
+
+    Similar to _copy_table_data_with_retry but filters to only reachable RIDs.
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        sname: Schema name.
+        tname: Table name.
+        reachable_rids: Set of RIDs to copy.
+        page_size: Number of rows to fetch per request.
+        report: Clone report for recording issues.
+        truncate_oversized: Whether to truncate oversized values.
+
+    Returns:
+        Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
+    """
+    tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+    table_key = f"{sname}:{tname}"
+
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+
+    rows_copied = 0
+    rows_skipped = 0
+    skipped_rids: list[str] = []
+    truncated_values: list[TruncatedValue] = []
+
+    if not reachable_rids:
+        return 0, 0, [], []
+
+    # Convert RIDs to sorted list for pagination
+    rid_list = sorted(reachable_rids)
+
+    # Process in batches
+    for i in range(0, len(rid_list), page_size):
+        batch_rids = rid_list[i:i + page_size]
+
+        # Build query with RID filter
+        rid_filter = ",".join(urlquote(rid) for rid in batch_rids)
+        try:
+            page = src_catalog.get(f"/entity/{tname_uri}/RID=any({rid_filter})").json()
+        except Exception as e:
+            logger.warning(f"Failed to fetch batch from {table_key}: {e}")
+            rows_skipped += len(batch_rids)
+            skipped_rids.extend(batch_rids)
+            continue
+
+        if not page:
+            continue
+
+        # Optionally truncate oversized values
+        rows_to_insert = []
+        for row in page:
+            if truncate_oversized:
+                modified_row = row.copy()
+                for col, value in row.items():
+                    if isinstance(value, str):
+                        value_bytes = len(value.encode('utf-8'))
+                        if value_bytes > MAX_INDEX_VALUE_BYTES:
+                            max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                            truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                            while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                                max_chars -= 100
+                                truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                            modified_row[col] = truncated
+                            truncated_values.append(TruncatedValue(
+                                table=table_key,
+                                rid=str(row.get('RID', 'unknown')),
+                                column=col,
+                                original_bytes=value_bytes,
+                                truncated_bytes=len(truncated.encode('utf-8')),
+                            ))
+                rows_to_insert.append(modified_row)
+            else:
+                rows_to_insert.append(row)
+
+        # Insert into destination
+        try:
+            dst_catalog.post(f"/entity/{tname_uri}", json=rows_to_insert)
+            rows_copied += len(rows_to_insert)
+        except Exception as e:
+            error_str = str(e)
+            if "index row size" in error_str.lower() or "btree" in error_str.lower():
+                # Row-by-row fallback for index size errors
+                for row in rows_to_insert:
+                    try:
+                        dst_catalog.post(f"/entity/{tname_uri}", json=[row])
+                        rows_copied += 1
+                    except Exception:
+                        rows_skipped += 1
+                        skipped_rids.append(str(row.get('RID', 'unknown')))
+            else:
+                logger.warning(f"Failed to insert into {table_key}: {e}")
+                rows_skipped += len(rows_to_insert)
+                skipped_rids.extend(str(row.get('RID', 'unknown')) for row in rows_to_insert)
+
+    return rows_copied, rows_skipped, skipped_rids, truncated_values
+
+
 def clone_catalog(
     source_hostname: str,
     source_catalog_id: str,
@@ -1072,7 +2055,7 @@ def clone_catalog(
|
|
|
1072
2055
|
clone_timestamp = datetime.now(timezone.utc).isoformat()
|
|
1073
2056
|
|
|
1074
2057
|
# Perform the three-stage clone
|
|
1075
|
-
orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values = _clone_three_stage(
|
|
2058
|
+
orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values = _clone_three_stage(
|
|
1076
2059
|
src_catalog=src_catalog,
|
|
1077
2060
|
dst_catalog=dst_catalog,
|
|
1078
2061
|
copy_data=not schema_only,
|
|
@@ -1136,8 +2119,13 @@ def clone_catalog(
|
|
|
1136
2119
|
asset_mode=asset_mode.value,
|
|
1137
2120
|
exclude_schemas=exclude_schemas or [],
|
|
1138
2121
|
exclude_objects=exclude_objects or [],
|
|
2122
|
+
add_ml_schema=add_ml_schema,
|
|
2123
|
+
copy_annotations=copy_annotations,
|
|
2124
|
+
copy_policy=copy_policy,
|
|
2125
|
+
reinitialize_dataset_versions=reinitialize_dataset_versions,
|
|
1139
2126
|
rows_copied=total_rows_copied,
|
|
1140
2127
|
rows_skipped=rows_skipped,
|
|
2128
|
+
skipped_rids=skipped_rids,
|
|
1141
2129
|
truncated_count=len(truncated_values),
|
|
1142
2130
|
orphan_rows_removed=orphan_rows_removed,
|
|
1143
2131
|
orphan_rows_nullified=orphan_rows_nullified,
|
|
@@ -1186,10 +2174,10 @@ def _clone_three_stage(
|
|
|
1186
2174
|
prune_hidden_fkeys: bool,
|
|
1187
2175
|
truncate_oversized: bool,
|
|
1188
2176
|
report: CloneReport,
|
|
1189
|
-
) -> tuple[int, int, int, int, list[TruncatedValue]]:
|
|
2177
|
+
) -> tuple[int, int, int, int, list[str], list[TruncatedValue]]:
|
|
1190
2178
|
"""Perform three-stage catalog cloning.
|
|
1191
2179
|
|
|
1192
|
-
Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values)
|
|
2180
|
+
Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values)
|
|
1193
2181
|
"""
|
|
1194
2182
|
src_model = src_catalog.getCatalogModel()
|
|
1195
2183
|
|
|
@@ -1282,7 +2270,7 @@ def _clone_three_stage(
             report.add_issue(CloneIssue(
                 severity=CloneIssueSeverity.WARNING,
                 category=CloneIssueCategory.FK_PRUNED,
-                message=
+                message="FK pruned due to hidden reference data",
                 table=f"{sname}:{tname}",
                 details=f"FK {fk_name} references columns with 'select': null",
                 action="Source catalog may have incoherent policies",
@@ -1328,6 +2316,7 @@ def _clone_three_stage(
     # Stage 2: Copy data
     total_rows = 0
     total_rows_skipped = 0
+    all_skipped_rids: list[str] = []
     all_truncated_values: list[TruncatedValue] = []
     deferred_indexes: dict[str, list[dict]] = {}  # Track indexes dropped for later rebuild

@@ -1343,7 +2332,7 @@ def _clone_three_stage(
             logger.debug(f"Copying data for {table_key}")

             # Use the new copy function with index error handling
-            table_rows, rows_skipped, truncated = _copy_table_data_with_retry(
+            table_rows, rows_skipped, skipped_rids, truncated = _copy_table_data_with_retry(
                 src_catalog=src_catalog,
                 dst_catalog=dst_catalog,
                 sname=sname,
@@ -1355,6 +2344,7 @@ def _clone_three_stage(
             )

             total_rows_skipped += rows_skipped
+            all_skipped_rids.extend(skipped_rids)
             all_truncated_values.extend(truncated)

             if table_rows < 0:
@@ -1581,7 +2571,7 @@ def _clone_three_stage(
     if copy_annotations or copy_policy:
         _copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)

-    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_truncated_values
+    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_skipped_rids, all_truncated_values


 def _identify_orphan_values(
@@ -1684,7 +2674,7 @@ def _delete_orphan_rows(
             report.add_issue(CloneIssue(
                 severity=CloneIssueSeverity.WARNING,
                 category=CloneIssueCategory.ORPHAN_ROWS,
-                message=
+                message="Orphan rows deleted",
                 table=table_key,
                 details=f"Missing references to: {ref_key} ({len(orphan_values)})",
                 action="Source catalog may have incoherent row-level policies",
@@ -1892,10 +2882,27 @@ def _post_clone_operations(

     if add_ml_schema:
         try:
-            from deriva_ml.schema import create_ml_schema
             catalog = server.connect_ermrest(result.catalog_id)
             create_ml_schema(catalog)
             result.ml_schema_added = True
+
+            # Apply catalog annotations (chaise-config, navbar, etc.)
+            # Import DerivaML locally to avoid circular import (deriva_ml.__init__ imports from clone.py)
+            try:
+                from deriva_ml import DerivaML
+                ml = DerivaML(result.hostname, result.catalog_id, check_auth=False)
+                ml.apply_catalog_annotations()
+                logger.info("Applied catalog annotations (chaise-config, navbar)")
+            except Exception as e:
+                logger.warning(f"Failed to apply catalog annotations: {e}")
+                if result.report:
+                    result.report.add_issue(CloneIssue(
+                        severity=CloneIssueSeverity.WARNING,
+                        category=CloneIssueCategory.SCHEMA_ISSUE,
+                        message="Failed to apply catalog annotations",
+                        details=str(e),
+                        action="Manually call apply_catalog_annotations() after clone",
+                    ))
         except Exception as e:
             logger.warning(f"Failed to add ML schema: {e}")
             if result.report:
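Two details in the block above are easy to miss: DerivaML is imported inside the function because deriva_ml.__init__ itself imports from clone.py, and the annotation step has its own try/except so a failure there is reported but does not abort the ML-schema setup. A generic sketch of that deferred-import pattern (the wrapper function below is hypothetical):

import logging

logger = logging.getLogger("deriva_ml")

def apply_ui_config(hostname: str, catalog_id: str) -> bool:
    """Hypothetical wrapper: apply Chaise UI annotations, tolerating failure."""
    try:
        # Deferred import: deriva_ml/__init__.py imports from clone.py, so a
        # module-level import here would create a cycle.
        from deriva_ml import DerivaML
        DerivaML(hostname, catalog_id, check_auth=False).apply_catalog_annotations()
        return True
    except Exception as exc:
        logger.warning("Catalog annotations not applied: %s", exc)
        return False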
@@ -1945,3 +2952,549 @@ def _reinitialize_dataset_versions(
         logger.warning(f"Failed to reinitialize dataset versions: {e}")

     return result
+
+
+# =============================================================================
+# Clone Subset Catalog
+# =============================================================================
+
+
+def clone_subset_catalog(
+    source_hostname: str,
+    source_catalog_id: str,
+    root_rid: str,
+    include_tables: list[str] | None = None,
+    exclude_objects: list[str] | None = None,
+    exclude_schemas: list[str] | None = None,
+    include_associations: bool = True,
+    include_vocabularies: bool = True,
+    use_export_annotation: bool = False,
+    dest_hostname: str | None = None,
+    alias: str | None = None,
+    add_ml_schema: bool = False,
+    asset_mode: AssetCopyMode = AssetCopyMode.REFERENCES,
+    copy_annotations: bool = True,
+    copy_policy: bool = True,
+    source_credential: dict | None = None,
+    dest_credential: dict | None = None,
+    orphan_strategy: OrphanStrategy = OrphanStrategy.FAIL,
+    prune_hidden_fkeys: bool = False,
+    truncate_oversized: bool = False,
+    reinitialize_dataset_versions: bool = True,
+) -> CloneCatalogResult:
+    """Clone a subset of a catalog containing only data reachable from a root RID.
+
+    Can use either FK graph traversal or export annotations to determine which
+    tables to include. When use_export_annotation=True, the tables and paths
+    defined in the root table's export annotation (tag:isrd.isi.edu,2019:export)
+    are used, which matches the behavior of the BDBag export button.
+
+    Args:
+        source_hostname: Hostname of the source catalog server.
+        source_catalog_id: ID of the catalog to clone from.
+        root_rid: The starting RID from which to trace reachability.
+        include_tables: Optional list of additional table names ("schema:table"
+            format) to use as starting points for table discovery. If None,
+            discovery starts only from the root RID's table.
+        exclude_objects: List of tables ("schema:table" format) to exclude from
+            cloning. Paths through these tables are not followed.
+        exclude_schemas: List of schema names to exclude entirely from cloning.
+        include_associations: If True, auto-include association tables needed
+            for FK integrity between discovered tables.
+        include_vocabularies: If True, auto-include vocabulary tables referenced
+            by discovered tables.
+        use_export_annotation: If True, use the export annotation on the root
+            table to determine which tables and paths to clone. This matches the
+            behavior of the BDBag export button. If False (default), discover
+            tables via FK graph traversal.
+        dest_hostname: Destination hostname. If None, uses source_hostname.
+        alias: Optional alias for the new catalog.
+        add_ml_schema: If True, add DerivaML schema to clone.
+        asset_mode: How to handle assets (NONE, REFERENCES, or FULL).
+        copy_annotations: If True, copy annotations to clone.
+        copy_policy: If True, copy ACLs/ACL bindings to clone.
+        source_credential: Credentials for source catalog.
+        dest_credential: Credentials for destination catalog.
+        orphan_strategy: How to handle orphan rows (FAIL, DELETE, or NULLIFY).
+        prune_hidden_fkeys: If True, prune FKs with hidden reference data.
+        truncate_oversized: If True, truncate values that exceed index size limits.
+        reinitialize_dataset_versions: If True, reinitialize dataset versions.
+
+    Returns:
+        CloneCatalogResult with details of the cloned catalog.
+
+    Raises:
+        ValueError: If root_rid is not found in any table.
+        ValueError: If include_tables contains invalid table specifications.
+        ValueError: If use_export_annotation=True but no export annotation found.
+
+    Example:
+        >>> # Clone using export annotation (matches BDBag export)
+        >>> result = clone_subset_catalog(
+        ...     source_hostname="www.facebase.org",
+        ...     source_catalog_id="1",
+        ...     root_rid="3-HXMC",
+        ...     use_export_annotation=True,
+        ...     alias="my-project-clone",
+        ... )
+
+        >>> # Clone all tables reachable from a dataset, excluding Execution table
+        >>> result = clone_subset_catalog(
+        ...     source_hostname="www.example.org",
+        ...     source_catalog_id="1",
+        ...     root_rid="ABC123",
+        ...     exclude_objects=["deriva-ml:Execution"],
+        ...     alias="my-subset",
+        ... )
+        >>> print(f"Created catalog {result.catalog_id}")
+
+        >>> # Clone with additional starting tables
+        >>> result = clone_subset_catalog(
+        ...     source_hostname="www.example.org",
+        ...     source_catalog_id="1",
+        ...     root_rid="ABC123",
+        ...     include_tables=["demo:Configuration"],  # Also discover from here
+        ...     exclude_schemas=["audit"],
+        ... )
+    """
+    include_tables = include_tables or []
+    exclude_objects = exclude_objects or []
+    exclude_schemas_set = set(exclude_schemas) if exclude_schemas else set()
+
+    # Validate table format for include_tables
+    for table_spec in include_tables:
+        if ":" not in table_spec:
+            raise ValueError(f"Table must be specified as 'schema:table', got: {table_spec}")
+
+    # Parse exclude_objects into set of tuples
+    excluded_tables: set[tuple[str, str]] = set()
+    for table_spec in exclude_objects:
+        if ":" not in table_spec:
+            raise ValueError(f"exclude_objects entries must be 'schema:table', got: {table_spec}")
+        schema, table = table_spec.split(":", 1)
+        excluded_tables.add((schema, table))
+
+    dest_hostname = dest_hostname or source_hostname
+
+    # Get credentials
+    src_cred = source_credential or get_credential(source_hostname)
+    dst_cred = dest_credential or get_credential(dest_hostname)
+
+    # Connect to source catalog
+    src_server = DerivaServer("https", source_hostname, credentials=src_cred)
+    src_catalog = src_server.connect_ermrest(source_catalog_id)
+    src_model = src_catalog.getCatalogModel()
+
+    logger.info(f"Connected to source catalog {source_hostname}/{source_catalog_id}")
+
+    # First, find the table containing the root RID
+    root_table_key = None
+    for sname, schema in src_model.schemas.items():
+        if sname in {"public", "_acl_admin", "WWW"} or sname in exclude_schemas_set:
+            continue
+        for tname, table in schema.tables.items():
+            if (sname, tname) in excluded_tables:
+                continue
+            if table.kind != 'table' or 'RID' not in table.column_definitions.elements:
+                continue
+            try:
+                table_spec = f"{sname}:{tname}"
+                uri = f"/entity/{_quote_table_spec(table_spec)}/RID={urlquote(root_rid)}"
+                result = src_catalog.get(uri).json()
+                if result:
+                    root_table_key = table_spec
+                    break
+            except Exception:
+                continue
+        if root_table_key:
+            break
+
+    if root_table_key is None:
+        raise ValueError(f"Root RID {root_rid} not found in any accessible table")
+
+    logger.info(f"Root RID {root_rid} found in table {root_table_key}")
+
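The loop above locates the table that owns the root RID by issuing an /entity GET with a RID filter against every candidate table. A small sketch of the URL being built; _quote_table_spec in the module is assumed to URL-encode the schema and table parts the same way:

from urllib.parse import quote as urlquote

# Sketch of the RID probe URL constructed above.
def rid_probe_url(schema: str, table: str, rid: str) -> str:
    return f"/entity/{urlquote(schema)}:{urlquote(table)}/RID={urlquote(rid)}"

# e.g. rid_probe_url("deriva-ml", "Dataset", "1-ABCD")
# -> "/entity/deriva-ml:Dataset/RID=1-ABCD"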
+    # Get the root table object for export annotation parsing
+    root_schema, root_tname = root_table_key.split(":", 1)
+    root_table_obj = src_model.schemas[root_schema].tables[root_tname]
+
+    # Track paths for efficient RID computation (when using export annotation)
+    export_paths: list[list[str]] = []
+
+    if use_export_annotation:
+        # Use export annotation to determine tables
+        logger.info("Using export annotation to determine tables...")
+        discovered_tables, export_paths = _parse_export_annotation_tables(
+            root_table_obj, []
+        )
+
+        if not discovered_tables or len(discovered_tables) <= 1:
+            raise ValueError(
+                f"No export annotation found on table {root_table_key} or annotation "
+                f"contains no paths. Set use_export_annotation=False to use FK graph traversal."
+            )
+
+        logger.info(f"Export annotation defines {len(discovered_tables)} tables and {len(export_paths)} paths")
+
+        # Add any explicitly included tables
+        for table_spec in (include_tables or []):
+            if table_spec not in discovered_tables:
+                discovered_tables.append(table_spec)
+
+        # Filter out excluded tables
+        discovered_tables = [
+            t for t in discovered_tables
+            if tuple(t.split(":", 1)) not in excluded_tables
+        ]
+
+    else:
+        # Build starting tables: root table + any explicitly included tables
+        start_tables = [root_table_key]
+        for table_spec in include_tables:
+            if table_spec not in start_tables:
+                start_tables.append(table_spec)
+
+        # Discover all reachable tables from starting points using FK traversal
+        logger.info(f"Discovering tables reachable from {start_tables}...")
+        discovered_tables = _discover_reachable_tables(
+            model=src_model,
+            start_tables=start_tables,
+            exclude_tables=excluded_tables,
+            exclude_schemas=exclude_schemas_set,
+        )
+
+        logger.info(f"Discovered {len(discovered_tables)} connected tables")
+
+    # Expand with associations and vocabularies
+    all_tables = list(discovered_tables)
+    associations_added: list[str] = []
+    vocabularies_added: list[str] = []
+
+    if include_associations:
+        all_tables, associations_added = _expand_tables_with_associations(src_model, all_tables)
+        # Filter out excluded tables from associations
+        associations_added = [
+            t for t in associations_added
+            if tuple(t.split(":", 1)) not in excluded_tables
+        ]
+        all_tables = [
+            t for t in all_tables
+            if tuple(t.split(":", 1)) not in excluded_tables
+        ]
+        if associations_added:
+            logger.info(f"Auto-added association tables: {associations_added}")
+
+    if include_vocabularies:
+        all_tables, vocabularies_added = _expand_tables_with_vocabularies(src_model, all_tables)
+        # Filter out excluded tables from vocabularies
+        vocabularies_added = [
+            t for t in vocabularies_added
+            if tuple(t.split(":", 1)) not in excluded_tables
+        ]
+        all_tables = [
+            t for t in all_tables
+            if tuple(t.split(":", 1)) not in excluded_tables
+        ]
+        if vocabularies_added:
+            logger.info(f"Auto-added vocabulary tables: {vocabularies_added}")
+
+    logger.info(f"Will clone {len(all_tables)} tables: {all_tables}")
+
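The expansion above pulls in association and vocabulary tables so that foreign keys of the discovered tables still resolve, then filters every list against exclude_objects. The repeated exclusion comprehension could be factored into a helper; a sketch, reusing the names from the surrounding code:

# Sketch of the exclusion filter repeated above, factored into one helper.
def drop_excluded(tables: list[str], excluded: set[tuple[str, str]]) -> list[str]:
    """Remove 'schema:table' specs whose (schema, table) pair is excluded."""
    return [t for t in tables if tuple(t.split(":", 1)) not in excluded]

# usage (names as in the surrounding code):
#   associations_added = drop_excluded(associations_added, excluded_tables)
#   all_tables = drop_excluded(all_tables, excluded_tables)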
+    # Compute reachable RIDs
+    logger.info(f"Computing reachable rows from root RID {root_rid}...")
+
+    if use_export_annotation and export_paths:
+        # Use the predefined paths from export annotation (more efficient)
+        # Also pass model to enable FK reference expansion
+        reachable_rids = _compute_reachable_rids_from_paths(
+            src_catalog, root_rid, root_table_key, export_paths, all_tables, src_model
+        )
+    else:
+        # Use FK graph traversal
+        reachable_rids = _compute_reachable_rids(src_catalog, root_rid, all_tables, src_model)
+
+    total_rows = sum(len(rids) for rids in reachable_rids.values())
+    logger.info(f"Found {total_rows} reachable rows across {len(all_tables)} tables")
+
+    for table_spec, rids in reachable_rids.items():
+        if rids:
+            logger.debug(f"  {table_spec}: {len(rids)} rows")
+
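Whichever branch runs, reachable_rids maps a "schema:table" spec to the set of RIDs that must be copied. The FK-graph traversal behind _compute_reachable_rids is defined elsewhere in this module; as an orientation only, a simplified breadth-first sketch over an abstract neighbor function:

from collections import deque

def reachable(root_table: str, root_rid: str, neighbors) -> dict[str, set[str]]:
    """Breadth-first reachability over an abstract FK graph.

    `neighbors(table, rid)` yields (related_table, related_rid) pairs; building
    that from ERMrest joins is the part the real helper implements.
    """
    seen: dict[str, set[str]] = {root_table: {root_rid}}
    queue = deque([(root_table, root_rid)])
    while queue:
        table, rid = queue.popleft()
        for next_table, next_rid in neighbors(table, rid):
            if next_rid not in seen.setdefault(next_table, set()):
                seen[next_table].add(next_rid)
                queue.append((next_table, next_rid))
    return seen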
+    # Create report
+    report = CloneReport()
+
+    # Parse tables into set for quick lookup
+    included_tables: set[tuple[str, str]] = set()
+    for table_spec in all_tables:
+        schema, table = table_spec.split(":", 1)
+        included_tables.add((schema, table))
+
+    # Create destination catalog
+    dst_server = DerivaServer("https", dest_hostname, credentials=dst_cred)
+    dst_catalog = dst_server.create_ermrest_catalog()
+    dst_catalog_id = dst_catalog.catalog_id
+
+    logger.info(f"Created destination catalog {dest_hostname}/{dst_catalog_id}")
+
+    try:
+        # Build model content for included tables only
+        new_model = []
+        fkeys_deferred = []
+        clone_states = {}
+
+        def prune_parts(d, *extra_victims):
+            victims = set(extra_victims)
+            if not copy_annotations:
+                victims |= {'annotations'}
+            if not copy_policy:
+                victims |= {'acls', 'acl_bindings'}
+            for k in victims:
+                d.pop(k, None)
+            return d
+
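prune_parts strips annotation and policy sections from the prejson() dictionaries before they are posted to the new catalog, honoring copy_annotations and copy_policy. A tiny illustration of its effect on a hand-written table definition (the input dict below is made up for the example, not real catalog output):

# Illustrative input showing what gets removed when neither annotations nor
# policy are copied.
pruned = {
    "table_name": "Image",
    "annotations": {"tag:isrd.isi.edu,2016:visible-columns": {}},
    "acls": {"select": ["*"]},
    "acl_bindings": {},
    "column_definitions": [],
}
for key in ("annotations", "acls", "acl_bindings"):
    pruned.pop(key, None)
assert set(pruned) == {"table_name", "column_definitions"}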
+        # Collect schemas that have included tables
+        included_schemas: set[str] = {schema for schema, _ in included_tables}
+
+        for sname in included_schemas:
+            if sname not in src_model.schemas:
+                continue
+
+            schema = src_model.schemas[sname]
+            schema_def = prune_parts(schema.prejson(), 'tables')
+            new_model.append(schema_def)
+
+            for tname, table in schema.tables.items():
+                if (sname, tname) not in included_tables:
+                    continue
+
+                if table.kind != 'table':
+                    continue
+
+                if 'RID' not in table.column_definitions.elements:
+                    logger.warning(f"Table {sname}.{tname} lacks system columns, skipping")
+                    report.tables_skipped.append(f"{sname}:{tname}")
+                    continue
+
+                # Create table definition without FKs
+                table_def = prune_parts(table.prejson(), 'foreign_keys')
+                table_def['column_definitions'] = [
+                    prune_parts(c) for c in table_def['column_definitions']
+                ]
+                table_def['keys'] = [prune_parts(k) for k in table_def.get('keys', [])]
+                table_def.setdefault('annotations', {})[_clone_state_url] = 1
+
+                new_model.append(table_def)
+                clone_states[(sname, tname)] = 1
+
+                # Collect FKs (only those between included tables)
+                for fkdef in table.prejson().get('foreign_keys', []):
+                    include_fk = True
+                    for ref_col in fkdef.get('referenced_columns', []):
+                        ref_schema = ref_col.get('schema_name')
+                        ref_table = ref_col.get('table_name')
+                        if (ref_schema, ref_table) not in included_tables:
+                            include_fk = False
+                            break
+
+                    if include_fk:
+                        fkeys_deferred.append((sname, tname, prune_parts(fkdef.copy())))
+
+        # Stage 1: Create schema without FKs
+        logger.info("Stage 1: Creating schema without foreign keys...")
+        if new_model:
+            dst_catalog.post("/schema", json=new_model)
+
+        # Stage 2: Copy filtered data
+        logger.info("Stage 2: Copying filtered data...")
+        total_rows_copied = 0
+        total_rows_skipped = 0
+        all_skipped_rids: list[str] = []
+        all_truncated_values: list[TruncatedValue] = []
+        page_size = 1000
+
+        if use_export_annotation and export_paths:
+            # Use path-based copying to respect row-level security
+            logger.info("Using path-based copying (respects row-level ACLs)...")
+            rows_by_table = _copy_data_via_export_paths(
+                src_catalog=src_catalog,
+                dst_catalog=dst_catalog,
+                root_table=root_table_key,
+                root_rid=root_rid,
+                export_paths=export_paths,
+                all_tables=all_tables,
+                report=report,
+                truncate_oversized=truncate_oversized,
+                page_size=page_size,
+            )
+            for table_key, rows in rows_by_table.items():
+                report.tables_restored[table_key] = rows
+                total_rows_copied += rows
+
+            # Mark all tables complete
+            for (sname, tname), state in clone_states.items():
+                if state == 1:
+                    try:
+                        dst_catalog.put(
+                            f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                            json=2
+                        )
+                    except Exception:
+                        pass
+        else:
+            # Use RID-based copying (original approach)
+            for (sname, tname), state in clone_states.items():
+                if state != 1:
+                    continue
+
+                table_key = f"{sname}:{tname}"
+                table_reachable = reachable_rids.get(table_key, set())
+
+                if not table_reachable:
+                    logger.debug(f"No reachable rows for {table_key}")
+                    report.tables_restored[table_key] = 0
+                    continue
+
+                logger.debug(f"Copying {len(table_reachable)} rows for {table_key}")
+
+                rows_copied, rows_skipped, skipped, truncated = _copy_subset_table_data(
+                    src_catalog=src_catalog,
+                    dst_catalog=dst_catalog,
+                    sname=sname,
+                    tname=tname,
+                    reachable_rids=table_reachable,
+                    page_size=page_size,
+                    report=report,
+                    truncate_oversized=truncate_oversized,
+                )
+
+                total_rows_copied += rows_copied
+                total_rows_skipped += rows_skipped
+                all_skipped_rids.extend(skipped)
+                all_truncated_values.extend(truncated)
+
+                report.tables_restored[table_key] = rows_copied
+
+                # Mark complete
+                try:
+                    dst_catalog.put(
+                        f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                        json=2
+                    )
+                except Exception:
+                    pass
+
+        logger.info(f"Copied {total_rows_copied} rows, skipped {total_rows_skipped}")
+
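After Stage 2, each copied table's clone-state annotation is set to 2, which lets a later pass tell finished tables from partially cloned ones. A sketch of reading that marker back; it assumes the annotation GET returns the stored JSON value, with _clone_state_url passed in as the annotation key:

from urllib.parse import quote as urlquote

def table_is_cloned(dst_catalog, sname: str, tname: str, state_url: str) -> bool:
    """Sketch: True when the clone-state annotation written above equals 2."""
    try:
        resp = dst_catalog.get(
            f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(state_url)}"
        )
        return resp.json() == 2
    except Exception:
        return False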
+        # Stage 3: Apply FKs
+        logger.info(f"Stage 3: Applying {len(fkeys_deferred)} foreign keys...")
+        fkeys_applied = 0
+        fkeys_failed = 0
+
+        for sname, tname, fkdef in fkeys_deferred:
+            fk_name = fkdef.get('names', [[sname, 'unknown']])[0]
+            try:
+                dst_catalog.post(
+                    f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/foreignkey",
+                    json=fkdef
+                )
+                fkeys_applied += 1
+                report.fkeys_applied += 1
+            except Exception as e:
+                error_str = str(e)
+                if "violates foreign key constraint" in error_str:
+                    if orphan_strategy == OrphanStrategy.FAIL:
+                        report.add_issue(CloneIssue(
+                            severity=CloneIssueSeverity.ERROR,
+                            category=CloneIssueCategory.FK_VIOLATION,
+                            message="FK constraint violation",
+                            table=f"{sname}:{tname}",
+                            details=f"FK {fk_name}: {error_str[:200]}",
+                            action="Some reachable rows may have dangling references",
+                        ))
+                    fkeys_failed += 1
+                    report.fkeys_failed += 1
+                else:
+                    logger.warning(f"Failed to apply FK {fk_name}: {e}")
+                    fkeys_failed += 1
+                    report.fkeys_failed += 1
+
+        logger.info(f"Applied {fkeys_applied} FKs, failed {fkeys_failed}")
+
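Foreign keys are deliberately applied last (Stage 3) so data can be loaded without ordering tables by dependency; violations are then reported according to orphan_strategy. For orientation, roughly what one deferred fkdef collected from prejson() looks like (all values below are invented for the example):

# Illustrative shape of a deferred foreign key definition; field values are
# made up for the example.
fkdef = {
    "names": [["deriva-ml", "Image_Dataset_fkey"]],
    "foreign_key_columns": [
        {"schema_name": "deriva-ml", "table_name": "Image", "column_name": "Dataset"}
    ],
    "referenced_columns": [
        {"schema_name": "deriva-ml", "table_name": "Dataset", "column_name": "RID"}
    ],
}
# Stage 3 then POSTs this to /schema/deriva-ml/table/Image/foreignkey.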
+        # Build result
+        result = CloneCatalogResult(
+            catalog_id=dst_catalog_id,
+            hostname=dest_hostname,
+            schema_only=False,
+            asset_mode=asset_mode,
+            source_hostname=source_hostname,
+            source_catalog_id=source_catalog_id,
+            source_snapshot=None,
+            alias=alias,
+            orphan_rows_removed=0,
+            orphan_rows_nullified=0,
+            fkeys_pruned=0,
+            rows_skipped=total_rows_skipped,
+            truncated_values=all_truncated_values,
+            report=report,
+        )
+
+        # Post-clone operations
+        if alias:
+            try:
+                dst_server.create_ermrest_alias(id=alias, alias_target=str(dst_catalog_id))
+                result.alias = alias
+            except Exception as e:
+                logger.warning(f"Failed to create alias '{alias}': {e}")
+
+        if add_ml_schema:
+            try:
+                create_ml_schema(dst_catalog)
+                result.ml_schema_added = True
+
+                # Apply catalog annotations (chaise-config, navbar, etc.)
+                # Import DerivaML locally to avoid circular import (deriva_ml.__init__ imports from clone.py)
+                try:
+                    from deriva_ml import DerivaML
+                    ml = DerivaML(dest_hostname, str(dst_catalog_id), check_auth=False)
+                    ml.apply_catalog_annotations()
+                    logger.info("Applied catalog annotations (chaise-config, navbar)")
+                except Exception as e:
+                    logger.warning(f"Failed to apply catalog annotations: {e}")
+            except Exception as e:
+                logger.warning(f"Failed to add ML schema: {e}")
+
+        if reinitialize_dataset_versions and "deriva-ml" in src_model.schemas:
+            result = _reinitialize_dataset_versions(result, dst_cred)
+
+        # Set defaultTable to the root table for partial clones
+        # This ensures the Chaise UI has a valid landing page
+        try:
+            chaise_config_url = "tag:isrd.isi.edu,2019:chaise-config"
+            dst_model = dst_catalog.getCatalogModel()
+            dst_model.annotations[chaise_config_url] = dst_model.annotations.get(chaise_config_url, {})
+            # Chaise expects defaultTable as an object with schema and table keys
+            root_schema, root_tname = root_table_key.split(":", 1)
+            dst_model.annotations[chaise_config_url]["defaultTable"] = {
+                "schema": root_schema,
+                "table": root_tname,
+            }
+            dst_model.apply()
+            logger.info(f"Set defaultTable to {root_table_key}")
+        except Exception as e:
+            logger.warning(f"Failed to set defaultTable annotation: {e}")
+
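The defaultTable setting above ends up inside the catalog's chaise-config annotation. Roughly the resulting JSON for a hypothetical root table deriva-ml:Dataset:

# Roughly the catalog annotation produced above (illustrative values).
chaise_config = {
    "tag:isrd.isi.edu,2019:chaise-config": {
        "defaultTable": {"schema": "deriva-ml", "table": "Dataset"}
    }
}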
+        logger.info(
+            f"Subset clone complete: {dest_hostname}/{dst_catalog_id} "
+            f"({total_rows_copied} rows in {len(clone_states)} tables)"
+        )
+
+        return result
+
+    except Exception as e:
+        # Clean up on failure
+        logger.error(f"Clone failed: {e}")
+        try:
+            dst_server.delete_ermrest_catalog(dst_catalog_id)
+            logger.info(f"Cleaned up failed catalog {dst_catalog_id}")
+        except Exception:
+            pass
+        raise