deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
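
The largest addition in this release is the new catalog cloning module, deriva_ml/catalog/clone.py, shown in the hunk below. Purely as an illustration (not part of the diff), a call to the new clone_catalog function might look like the following sketch; the parameter names come from the signature in the diff, while the import path is assumed from the new deriva_ml/catalog/__init__.py and should be verified against the installed package.

    # Hypothetical usage sketch; names are taken from the clone_catalog signature in the diff below.
    from deriva_ml.catalog import clone_catalog, OrphanStrategy  # assumed import path

    result = clone_catalog(
        "source.example.org", "21",             # source hostname and catalog ID
        dest_hostname="localhost",              # clone to a different server
        orphan_strategy=OrphanStrategy.DELETE,  # delete rows with dangling FK references
    )
    if result.report:
        print(result.report.to_text())          # human-readable clone report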
deriva_ml/catalog/clone.py
@@ -0,0 +1,1939 @@
+ """Enhanced catalog cloning with cross-server and selective asset support.
+
+ This module provides catalog cloning that handles the common case of incoherent
+ row-level policies in source catalogs. When source policies hide some domain
+ table rows but don't hide the referring rows, foreign key violations occur
+ during cloning.
+
+ The solution uses a three-stage approach:
+ 1. Create schema WITHOUT foreign keys
+ 2. Copy all data
+ 3. Apply foreign keys, handling violations by either:
+    - Deleting orphan rows (rows with dangling FK references)
+    - Nullifying references (setting dangling FK values to NULL)
+
+ This approach is more robust than trying to pre-filter data, as it handles
+ all edge cases including circular dependencies and complex FK relationships.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any
+ from urllib.parse import quote as urlquote
+
+ from deriva.core import DerivaServer, ErmrestCatalog, get_credential
+ from deriva.core.hatrac_store import HatracStore
+
+ logger = logging.getLogger("deriva_ml")
+
+
+ class CloneIssueSeverity(Enum):
+     """Severity level of clone issues."""
+
+     INFO = "info"
+     WARNING = "warning"
+     ERROR = "error"
+     CRITICAL = "critical"
+
+
+ class CloneIssueCategory(Enum):
+     """Category of clone issues."""
+
+     ACCESS_DENIED = "access_denied"
+     ORPHAN_ROWS = "orphan_rows"
+     DATA_INTEGRITY = "data_integrity"
+     SCHEMA_ISSUE = "schema_issue"
+     RESTORE_FAILURE = "restore_failure"
+     FK_VIOLATION = "fk_violation"
+     FK_PRUNED = "fk_pruned"  # FK was intentionally not applied
+     POLICY_INCOHERENCE = "policy_incoherence"
+     INDEX_REBUILT = "index_rebuilt"  # Index was dropped and rebuilt due to size limits
+
+
+ class OrphanStrategy(Enum):
+     """Strategy for handling orphan rows (rows with dangling FK references).
+
+     When cloning a catalog with incoherent row-level policies, some rows may
+     reference parent rows that are hidden from the cloning user. These orphan
+     rows would violate FK constraints.
+     """
+
+     FAIL = "fail"  # Fail the clone if FK violations occur
+     DELETE = "delete"  # Delete rows with dangling references
+     NULLIFY = "nullify"  # Set dangling FK values to NULL (requires nullok)
+
+
+ class AssetCopyMode(Enum):
+     """How to handle assets during catalog cloning."""
+
+     NONE = "none"  # Don't copy assets
+     REFERENCES = "refs"  # Keep URLs pointing to source
+     FULL = "full"  # Download and re-upload assets
+
+
+ @dataclass
+ class CloneIssue:
+     """A single issue encountered during catalog cloning."""
+
+     severity: CloneIssueSeverity
+     category: CloneIssueCategory
+     message: str
+     table: str | None = None
+     details: str | None = None
+     action: str | None = None
+     row_count: int = 0
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "severity": self.severity.value,
+             "category": self.category.value,
+             "message": self.message,
+             "table": self.table,
+             "details": self.details,
+             "action": self.action,
+             "row_count": self.row_count,
+         }
+
+     def __str__(self) -> str:
+         parts = [f"[{self.severity.value.upper()}]"]
+         if self.table:
+             parts.append(f"{self.table}:")
+         parts.append(self.message)
+         if self.row_count > 0:
+             parts.append(f"({self.row_count} rows)")
+         return " ".join(parts)
+
+
+ @dataclass
+ class CloneReport:
+     """Comprehensive report of catalog clone operation.
+
+     Tracks all issues encountered during cloning, including:
+     - Policy incoherence issues (FK violations due to hidden data)
+     - Orphan rows that were deleted or nullified
+     - FKs that were pruned or failed
+     - Tables that were restored, failed, or skipped
+
+     Provides both JSON and text output formats for reporting.
+     """
+
+     issues: list[CloneIssue] = field(default_factory=list)
+     tables_restored: dict[str, int] = field(default_factory=dict)
+     tables_failed: list[str] = field(default_factory=list)
+     tables_skipped: list[str] = field(default_factory=list)
+     orphan_details: dict[str, dict] = field(default_factory=dict)
+     fkeys_applied: int = 0
+     fkeys_failed: int = 0
+     fkeys_pruned: int = 0
+
+     def add_issue(self, issue: CloneIssue) -> None:
+         self.issues.append(issue)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Return the report as a JSON-serializable dictionary."""
+         return {
+             "summary": {
+                 "total_issues": len(self.issues),
+                 "errors": len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
+                 "warnings": len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
+                 "tables_restored": len(self.tables_restored),
+                 "tables_failed": len(self.tables_failed),
+                 "tables_skipped": len(self.tables_skipped),
+                 "total_rows_restored": sum(self.tables_restored.values()),
+                 "orphan_rows_removed": sum(
+                     d.get("rows_removed", 0) for d in self.orphan_details.values()
+                 ),
+                 "orphan_rows_nullified": sum(
+                     d.get("rows_nullified", 0) for d in self.orphan_details.values()
+                 ),
+                 "fkeys_applied": self.fkeys_applied,
+                 "fkeys_failed": self.fkeys_failed,
+                 "fkeys_pruned": self.fkeys_pruned,
+             },
+             "issues": [i.to_dict() for i in self.issues],
+             "tables_restored": self.tables_restored,
+             "tables_failed": self.tables_failed,
+             "tables_skipped": self.tables_skipped,
+             "orphan_details": self.orphan_details,
+         }
+
+     def to_json(self, indent: int = 2) -> str:
+         """Return the report as a formatted JSON string."""
+         import json
+         return json.dumps(self.to_dict(), indent=indent)
+
+     def to_text(self) -> str:
+         """Return the report as human-readable text."""
+         lines = []
+         lines.append("=" * 70)
+         lines.append("CATALOG CLONE REPORT")
+         lines.append("=" * 70)
+         lines.append("")
+
+         # Summary
+         summary = self.to_dict()["summary"]
+         lines.append("SUMMARY")
+         lines.append("-" * 40)
+         lines.append(f"Tables restored: {summary['tables_restored']}")
+         lines.append(f"Tables failed: {summary['tables_failed']}")
+         lines.append(f"Tables skipped: {summary['tables_skipped']}")
+         lines.append(f"Total rows restored: {summary['total_rows_restored']}")
+         lines.append(f"Orphan rows removed: {summary['orphan_rows_removed']}")
+         lines.append(f"Orphan rows nullified: {summary['orphan_rows_nullified']}")
+         lines.append(f"FKs applied: {summary['fkeys_applied']}")
+         lines.append(f"FKs failed: {summary['fkeys_failed']}")
+         lines.append(f"FKs pruned: {summary['fkeys_pruned']}")
+         lines.append(f"Errors: {summary['errors']}")
+         lines.append(f"Warnings: {summary['warnings']}")
+         lines.append("")
+
+         # Issues by severity
+         if self.issues:
+             lines.append("ISSUES")
+             lines.append("-" * 40)
+
+             # Group by severity
+             for severity in [CloneIssueSeverity.CRITICAL, CloneIssueSeverity.ERROR,
+                              CloneIssueSeverity.WARNING, CloneIssueSeverity.INFO]:
+                 severity_issues = [i for i in self.issues if i.severity == severity]
+                 if severity_issues:
+                     lines.append(f"\n{severity.value.upper()} ({len(severity_issues)}):")
+                     for issue in severity_issues:
+                         lines.append(f" - {issue}")
+                         if issue.details:
+                             # Truncate long details
+                             details = issue.details[:100] + "..." if len(issue.details) > 100 else issue.details
+                             lines.append(f" Details: {details}")
+                         if issue.action:
+                             lines.append(f" Action: {issue.action}")
+             lines.append("")
+
+         # Orphan details
+         if self.orphan_details:
+             lines.append("ORPHAN ROW DETAILS")
+             lines.append("-" * 40)
+             for table, details in self.orphan_details.items():
+                 removed = details.get("rows_removed", 0)
+                 nullified = details.get("rows_nullified", 0)
+                 lines.append(f" {table}:")
+                 if removed > 0:
+                     lines.append(f" Rows deleted: {removed}")
+                 if nullified > 0:
+                     lines.append(f" Rows nullified: {nullified}")
+                 missing = details.get("missing_references", {})
+                 for ref_table, count in missing.items():
+                     lines.append(f" -> missing references to {ref_table}: {count}")
+             lines.append("")
+
+         # Assessment
+         lines.append("CLONE ASSESSMENT")
+         lines.append("-" * 40)
+         if summary['errors'] > 0:
+             lines.append("Clone completed with ERRORS. Some FKs could not be applied.")
+             lines.append("The catalog schema may be degraded.")
+         elif summary['orphan_rows_removed'] > 0 or summary['orphan_rows_nullified'] > 0:
+             lines.append("Clone completed with orphan handling.")
+             lines.append("Source catalog may have incoherent row-level policies.")
+         elif summary['fkeys_pruned'] > 0:
+             lines.append("Clone completed with pruned FKs.")
+             lines.append("Some FK constraints were skipped due to hidden reference data.")
+         else:
+             lines.append("Clone completed successfully.")
+         lines.append("")
+         lines.append("=" * 70)
+
+         return "\n".join(lines)
+
+     def __str__(self) -> str:
+         """Return text representation of the report."""
+         return self.to_text()
+
+
+ @dataclass
+ class AssetFilter:
+     """Filter for selecting which assets to copy during cloning."""
+
+     tables: list[str] | None = None
+     rids: list[str] | None = None
+
+
+ @dataclass
+ class TruncatedValue:
+     """Record of a value that was truncated during cloning."""
+
+     table: str
+     rid: str
+     column: str
+     original_bytes: int
+     truncated_bytes: int
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "table": self.table,
+             "rid": self.rid,
+             "column": self.column,
+             "original_bytes": self.original_bytes,
+             "truncated_bytes": self.truncated_bytes,
+         }
+
+
+ @dataclass
+ class CloneCatalogResult:
+     """Result of a catalog clone operation."""
+
+     catalog_id: str
+     hostname: str
+     schema_only: bool
+     asset_mode: AssetCopyMode
+     source_hostname: str
+     source_catalog_id: str
+     source_snapshot: str | None = None
+     alias: str | None = None
+     ml_schema_added: bool = False
+     datasets_reinitialized: int = 0
+     orphan_rows_removed: int = 0
+     orphan_rows_nullified: int = 0
+     fkeys_pruned: int = 0
+     rows_skipped: int = 0
+     truncated_values: list[TruncatedValue] = field(default_factory=list)
+     report: CloneReport | None = None
+
+
+ # Clone state annotation URL (same as deriva-py)
+ _clone_state_url = "tag:isrd.isi.edu,2018:clone-state"
+
+ # Catalog provenance annotation URL
+ _catalog_provenance_url = "tag:deriva-ml.org,2025:catalog-provenance"
+
+ # Pattern to detect btree index size errors
+ _BTREE_INDEX_ERROR_PATTERN = "index row size"
+ _BTREE_INDEX_NAME_PATTERN = r'for index "([^"]+)"'
+
+
+ class CatalogCreationMethod(Enum):
+     """How a catalog was created."""
+
+     CLONE = "clone"  # Cloned from another catalog
+     CREATE = "create"  # Created programmatically (e.g., create_catalog)
+     SCHEMA = "schema"  # Created from schema definition
+     UNKNOWN = "unknown"  # Unknown or pre-existing catalog
+
+
+ @dataclass
+ class CloneDetails:
+     """Details specific to cloned catalogs."""
+
+     source_hostname: str
+     source_catalog_id: str
+     source_snapshot: str | None = None
+     source_schema_url: str | None = None  # Hatrac URL to source schema JSON
+     orphan_strategy: str = "fail"
+     truncate_oversized: bool = False
+     prune_hidden_fkeys: bool = False
+     schema_only: bool = False
+     asset_mode: str = "refs"
+     exclude_schemas: list[str] = field(default_factory=list)
+     exclude_objects: list[str] = field(default_factory=list)
+     rows_copied: int = 0
+     rows_skipped: int = 0
+     truncated_count: int = 0
+     orphan_rows_removed: int = 0
+     orphan_rows_nullified: int = 0
+     fkeys_pruned: int = 0
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "source_hostname": self.source_hostname,
+             "source_catalog_id": self.source_catalog_id,
+             "source_snapshot": self.source_snapshot,
+             "source_schema_url": self.source_schema_url,
+             "orphan_strategy": self.orphan_strategy,
+             "truncate_oversized": self.truncate_oversized,
+             "prune_hidden_fkeys": self.prune_hidden_fkeys,
+             "schema_only": self.schema_only,
+             "asset_mode": self.asset_mode,
+             "exclude_schemas": self.exclude_schemas,
+             "exclude_objects": self.exclude_objects,
+             "rows_copied": self.rows_copied,
+             "rows_skipped": self.rows_skipped,
+             "truncated_count": self.truncated_count,
+             "orphan_rows_removed": self.orphan_rows_removed,
+             "orphan_rows_nullified": self.orphan_rows_nullified,
+             "fkeys_pruned": self.fkeys_pruned,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "CloneDetails":
+         return cls(
+             source_hostname=data.get("source_hostname", ""),
+             source_catalog_id=data.get("source_catalog_id", ""),
+             source_snapshot=data.get("source_snapshot"),
+             source_schema_url=data.get("source_schema_url"),
+             orphan_strategy=data.get("orphan_strategy", "fail"),
+             truncate_oversized=data.get("truncate_oversized", False),
+             prune_hidden_fkeys=data.get("prune_hidden_fkeys", False),
+             schema_only=data.get("schema_only", False),
+             asset_mode=data.get("asset_mode", "refs"),
+             exclude_schemas=data.get("exclude_schemas", []),
+             exclude_objects=data.get("exclude_objects", []),
+             rows_copied=data.get("rows_copied", 0),
+             rows_skipped=data.get("rows_skipped", 0),
+             truncated_count=data.get("truncated_count", 0),
+             orphan_rows_removed=data.get("orphan_rows_removed", 0),
+             orphan_rows_nullified=data.get("orphan_rows_nullified", 0),
+             fkeys_pruned=data.get("fkeys_pruned", 0),
+         )
+
+
+ @dataclass
+ class CatalogProvenance:
+     """Provenance information for a catalog.
+
+     This metadata is stored as a catalog-level annotation and tracks
+     how the catalog was created, by whom, and with what parameters.
+     Supports both cloned catalogs and catalogs created by other means.
+
+     Attributes:
+         creation_method: How the catalog was created (clone, create, schema, unknown).
+         created_at: ISO timestamp when the catalog was created.
+         created_by: User or system that created the catalog (Globus identity or description).
+         hostname: Hostname where the catalog resides.
+         catalog_id: Catalog ID.
+         name: Human-readable name for the catalog.
+         description: Description of the catalog's purpose.
+         workflow_url: URL to the workflow/script that created the catalog (e.g., GitHub URL).
+         workflow_version: Version of the workflow (e.g., git commit hash, package version).
+         clone_details: If cloned, detailed information about the clone operation.
+     """
+
+     creation_method: CatalogCreationMethod
+     created_at: str
+     hostname: str
+     catalog_id: str
+     created_by: str | None = None
+     name: str | None = None
+     description: str | None = None
+     workflow_url: str | None = None
+     workflow_version: str | None = None
+     clone_details: CloneDetails | None = None
+
+     def to_dict(self) -> dict[str, Any]:
+         result = {
+             "creation_method": self.creation_method.value,
+             "created_at": self.created_at,
+             "hostname": self.hostname,
+             "catalog_id": self.catalog_id,
+             "created_by": self.created_by,
+             "name": self.name,
+             "description": self.description,
+             "workflow_url": self.workflow_url,
+             "workflow_version": self.workflow_version,
+         }
+         if self.clone_details:
+             result["clone_details"] = self.clone_details.to_dict()
+         return result
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "CatalogProvenance":
+         clone_details = None
+         if data.get("clone_details"):
+             clone_details = CloneDetails.from_dict(data["clone_details"])
+
+         # Handle legacy format where creation_method might be missing
+         method_str = data.get("creation_method", "unknown")
+         try:
+             creation_method = CatalogCreationMethod(method_str)
+         except ValueError:
+             creation_method = CatalogCreationMethod.UNKNOWN
+
+         return cls(
+             creation_method=creation_method,
+             created_at=data.get("created_at", ""),
+             hostname=data.get("hostname", ""),
+             catalog_id=data.get("catalog_id", ""),
+             created_by=data.get("created_by"),
+             name=data.get("name"),
+             description=data.get("description"),
+             workflow_url=data.get("workflow_url"),
+             workflow_version=data.get("workflow_version"),
+             clone_details=clone_details,
+         )
+
+     @property
+     def is_clone(self) -> bool:
+         """Return True if this catalog was cloned from another catalog."""
+         return self.creation_method == CatalogCreationMethod.CLONE and self.clone_details is not None
+
+
+ def _upload_source_schema(
+     hostname: str,
+     catalog_id: str,
+     schema_json: dict[str, Any],
+     credential: dict | None,
+ ) -> str | None:
+     """Upload source schema JSON to Hatrac.
+
+     Args:
+         hostname: Destination catalog hostname.
+         catalog_id: Destination catalog ID.
+         schema_json: The source schema as a dictionary.
+         credential: Credential for Hatrac access.
+
+     Returns:
+         Hatrac URL for the uploaded schema, or None if upload failed.
+     """
+     try:
+         cred = credential or get_credential(hostname)
+         hatrac = HatracStore("https", hostname, credentials=cred)
+
+         # Create namespace for catalog provenance metadata if it doesn't exist
+         namespace = f"/hatrac/catalog/{catalog_id}/provenance"
+         try:
+             hatrac.create_namespace(namespace, parents=True)
+         except Exception:
+             pass  # Namespace may already exist
+
+         # Upload schema JSON
+         schema_bytes = json.dumps(schema_json, indent=2).encode("utf-8")
+         object_path = f"{namespace}/source-schema.json"
+
+         url = hatrac.put_obj(
+             object_path,
+             schema_bytes,
+             content_type="application/json",
+         )
+
+         logger.info(f"Uploaded source schema to {url}")
+         return url
+
+     except Exception as e:
+         logger.warning(f"Failed to upload source schema to Hatrac: {e}")
+         return None
+
+
+ def _set_catalog_provenance(
+     dst_catalog: ErmrestCatalog,
+     provenance: CatalogProvenance,
+ ) -> None:
+     """Set the catalog provenance annotation on a catalog.
+
+     Args:
+         dst_catalog: Catalog connection.
+         provenance: Catalog provenance information.
+     """
+     try:
+         dst_catalog.put(
+             f"/annotation/{urlquote(_catalog_provenance_url)}",
+             json=provenance.to_dict(),
+         )
+         logger.info("Set catalog provenance annotation")
+     except Exception as e:
+         logger.warning(f"Failed to set catalog provenance annotation: {e}")
+
+
+ def set_catalog_provenance(
+     catalog: ErmrestCatalog,
+     name: str | None = None,
+     description: str | None = None,
+     workflow_url: str | None = None,
+     workflow_version: str | None = None,
+     creation_method: CatalogCreationMethod = CatalogCreationMethod.CREATE,
+ ) -> CatalogProvenance:
+     """Set catalog provenance information for a newly created catalog.
+
+     Use this function when creating a catalog programmatically to record
+     how and why it was created. This is similar to workflow metadata but
+     at the catalog level.
+
+     Args:
+         catalog: The catalog to annotate.
+         name: Human-readable name for the catalog.
+         description: Description of the catalog's purpose.
+         workflow_url: URL to the workflow/script that created the catalog
+             (e.g., GitHub URL, notebook URL).
+         workflow_version: Version of the workflow (e.g., git commit hash,
+             package version, or semantic version).
+         creation_method: How the catalog was created. Defaults to CREATE.
+
+     Returns:
+         The CatalogProvenance object that was set.
+
+     Example:
+         >>> from deriva_ml.catalog import set_catalog_provenance, CatalogCreationMethod
+         >>> provenance = set_catalog_provenance(
+         ...     catalog,
+         ...     name="CIFAR-10 Training Catalog",
+         ...     description="Catalog for CIFAR-10 image classification experiments",
+         ...     workflow_url="https://github.com/org/repo/blob/main/setup_catalog.py",
+         ...     workflow_version="v1.2.0",
+         ... )
+     """
+     # Try to get current user identity
+     created_by = None
+     try:
+         # Get user info from catalog session
+         session_info = catalog.get("/authn/session").json()
+         if session_info and "client" in session_info:
+             client = session_info["client"]
+             created_by = client.get("display_name") or client.get("id")
+     except Exception:
+         pass
+
+     # Get catalog info
+     try:
+         catalog_info = catalog.get("/").json()
+         hostname = catalog_info.get("meta", {}).get("host", "")
+         catalog_id = str(catalog.catalog_id)
+     except Exception:
+         hostname = ""
+         catalog_id = str(catalog.catalog_id)
+
+     provenance = CatalogProvenance(
+         creation_method=creation_method,
+         created_at=datetime.now(timezone.utc).isoformat(),
+         hostname=hostname,
+         catalog_id=catalog_id,
+         created_by=created_by,
+         name=name,
+         description=description,
+         workflow_url=workflow_url,
+         workflow_version=workflow_version,
+     )
+
+     _set_catalog_provenance(catalog, provenance)
+     return provenance
+
+
+ def get_catalog_provenance(catalog: ErmrestCatalog) -> CatalogProvenance | None:
+     """Get the catalog provenance information.
+
+     Returns provenance information if the catalog has it set. This includes
+     information about how the catalog was created (clone, create, schema),
+     who created it, and any workflow information.
+
+     Args:
+         catalog: The catalog to check.
+
+     Returns:
+         CatalogProvenance if available, None otherwise.
+     """
+     try:
+         model = catalog.getCatalogModel()
+         provenance_data = model.annotations.get(_catalog_provenance_url)
+         if provenance_data:
+             return CatalogProvenance.from_dict(provenance_data)
+     except Exception as e:
+         logger.debug(f"Could not get catalog provenance: {e}")
+
+     return None
+
+
+ def _parse_index_error(error_msg: str) -> tuple[str | None, str | None]:
+     """Parse a btree index size error to extract index name and column.
+
+     Args:
+         error_msg: The error message from ERMrest/PostgreSQL.
+
+     Returns:
+         Tuple of (index_name, column_name) if this is an index size error,
+         (None, None) otherwise.
+     """
+     import re
+
+     if _BTREE_INDEX_ERROR_PATTERN not in error_msg:
+         return None, None
+
+     # Extract index name from error message
+     match = re.search(_BTREE_INDEX_NAME_PATTERN, error_msg)
+     if not match:
+         return None, None
+
+     index_name = match.group(1)
+
+     # Try to extract column name from index name (common pattern: table__column_idx)
+     # e.g., "dataset__keywords_idx" -> "keywords"
+     if "__" in index_name and index_name.endswith("_idx"):
+         parts = index_name.rsplit("__", 1)
+         if len(parts) == 2:
+             column_name = parts[1].replace("_idx", "")
+             return index_name, column_name
+
+     return index_name, None
+
+
+
+
+ def _copy_table_data_with_retry(
672
+ src_catalog: ErmrestCatalog,
673
+ dst_catalog: ErmrestCatalog,
674
+ sname: str,
675
+ tname: str,
676
+ page_size: int,
677
+ report: "CloneReport",
678
+ deferred_indexes: dict[str, list[dict]],
679
+ truncate_oversized: bool = False,
680
+ ) -> tuple[int, int, list[TruncatedValue]]:
681
+ """Copy data for a single table with retry logic for index errors.
682
+
683
+ If a btree index size error occurs, this function will:
684
+ 1. Detect the problematic index and column
685
+ 2. Switch to row-by-row insertion mode
686
+ 3. Either truncate oversized values (if truncate_oversized=True) or skip rows
687
+ 4. Record skipped/truncated rows in the report
688
+
689
+ Args:
690
+ src_catalog: Source catalog connection.
691
+ dst_catalog: Destination catalog connection.
692
+ sname: Schema name.
693
+ tname: Table name.
694
+ page_size: Number of rows per page.
695
+ report: Clone report for recording issues.
696
+ deferred_indexes: Dict to collect indexes that need rebuilding.
697
+ Key is "schema:table", value is list of index definitions.
698
+ truncate_oversized: If True, truncate oversized values instead of skipping rows.
699
+
700
+ Returns:
701
+ Tuple of (rows_copied, rows_skipped, truncated_values).
702
+ rows_copied is -1 if the copy failed entirely.
703
+ """
704
+ tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
705
+ table_key = f"{sname}:{tname}"
706
+
707
+ # Maximum safe size for btree index values (with margin below 2704 limit)
708
+ MAX_INDEX_VALUE_BYTES = 2600
709
+ TRUNCATE_SUFFIX = "...[TRUNCATED]"
710
+
711
+ last = None
712
+ table_rows = 0
713
+ rows_skipped = 0
714
+ truncated_values: list[TruncatedValue] = []
715
+ row_by_row_mode = False
716
+ problematic_index = None
717
+ problematic_column = None
718
+
719
+ def truncate_row_values(row: dict, column: str | None) -> tuple[dict, list[TruncatedValue]]:
720
+ """Truncate oversized text values in a row.
721
+
722
+ Returns the modified row and list of truncation records.
723
+ """
724
+ truncations = []
725
+ modified_row = row.copy()
726
+ rid = row.get('RID', 'unknown')
727
+
728
+ # If we know the problematic column, only check that one
729
+ columns_to_check = [column] if column else list(row.keys())
730
+
731
+ for col in columns_to_check:
732
+ if col not in modified_row:
733
+ continue
734
+ value = modified_row[col]
735
+ if isinstance(value, str):
736
+ value_bytes = len(value.encode('utf-8'))
737
+ if value_bytes > MAX_INDEX_VALUE_BYTES:
738
+ # Truncate to safe size, accounting for suffix
739
+ max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
740
+ # Be conservative - truncate by character count as approximation
741
+ # since UTF-8 chars can be multi-byte
742
+ truncated = value[:max_chars] + TRUNCATE_SUFFIX
743
+ # Verify the result fits
744
+ while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
745
+ max_chars -= 100
746
+ truncated = value[:max_chars] + TRUNCATE_SUFFIX
747
+
748
+ modified_row[col] = truncated
749
+ truncations.append(TruncatedValue(
750
+ table=table_key,
751
+ rid=str(rid),
752
+ column=col,
753
+ original_bytes=value_bytes,
754
+ truncated_bytes=len(truncated.encode('utf-8')),
755
+ ))
756
+ logger.debug(
757
+ f"Truncated {table_key}.{col} for RID {rid}: "
758
+ f"{value_bytes} -> {len(truncated.encode('utf-8'))} bytes"
759
+ )
760
+
761
+ return modified_row, truncations
762
+
763
+ while True:
764
+ after_clause = f"@after({urlquote(last)})" if last else ""
765
+ try:
766
+ page = src_catalog.get(
767
+ f"/entity/{tname_uri}@sort(RID){after_clause}?limit={page_size}"
768
+ ).json()
769
+ except Exception as e:
770
+ logger.warning(f"Failed to read from {sname}:{tname}: {e}")
771
+ return -1, rows_skipped, truncated_values
772
+
773
+ if not page:
774
+ break
775
+
776
+ if row_by_row_mode:
777
+ # Insert rows one at a time, handling oversized values
778
+ for row in page:
779
+ row_to_insert = row
780
+
781
+ # If truncation is enabled, try to truncate first
782
+ if truncate_oversized and problematic_column:
783
+ row_to_insert, truncations = truncate_row_values(row, problematic_column)
784
+ truncated_values.extend(truncations)
785
+
786
+ try:
787
+ dst_catalog.post(
788
+ f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
789
+ json=[row_to_insert]
790
+ )
791
+ table_rows += 1
792
+ except Exception as row_error:
793
+ error_msg = str(row_error)
794
+ if _BTREE_INDEX_ERROR_PATTERN in error_msg:
795
+ # This row has a value too large for the index
796
+ if truncate_oversized:
797
+ # Try truncating all text columns
798
+ row_to_insert, truncations = truncate_row_values(row, None)
799
+ truncated_values.extend(truncations)
800
+ try:
801
+ dst_catalog.post(
802
+ f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
803
+ json=[row_to_insert]
804
+ )
805
+ table_rows += 1
806
+ continue
807
+ except Exception:
808
+ pass # Fall through to skip
809
+
810
+ rows_skipped += 1
811
+ rid = row.get('RID', 'unknown')
812
+ logger.debug(f"Skipping row {rid} in {table_key} due to index size limit")
813
+ else:
814
+ # Different error - log and skip
815
+ rows_skipped += 1
816
+ logger.debug(f"Skipping row in {table_key}: {row_error}")
817
+ last = page[-1]['RID']
818
+ else:
819
+ # Normal batch mode
820
+ try:
821
+ dst_catalog.post(
822
+ f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
823
+ json=page
824
+ )
825
+ last = page[-1]['RID']
826
+ table_rows += len(page)
827
+ except Exception as e:
828
+ error_msg = str(e)
829
+
830
+ # Check if this is a btree index size error
831
+ index_name, column_name = _parse_index_error(error_msg)
832
+
833
+ if index_name:
834
+ action_desc = "Values will be truncated" if truncate_oversized else "Rows with oversized values will be skipped"
835
+ logger.info(
836
+ f"Detected btree index size error for '{index_name}' on {table_key}. "
837
+ f"Switching to row-by-row mode. {action_desc}."
838
+ )
839
+ problematic_index = index_name
840
+ problematic_column = column_name
841
+ row_by_row_mode = True
842
+
843
+ # Record the issue
844
+ report.add_issue(CloneIssue(
845
+ severity=CloneIssueSeverity.WARNING,
846
+ category=CloneIssueCategory.INDEX_REBUILT,
847
+ message=f"Index '{index_name}' has oversized values, using row-by-row mode",
848
+ table=table_key,
849
+ details=f"Column '{column_name}' has values exceeding btree 2704 byte limit",
850
+ action=action_desc,
851
+ ))
852
+
853
+ # Retry this page in row-by-row mode
854
+ for row in page:
855
+ row_to_insert = row
856
+
857
+ # If truncation is enabled, try to truncate first
858
+ if truncate_oversized and problematic_column:
859
+ row_to_insert, truncations = truncate_row_values(row, problematic_column)
860
+ truncated_values.extend(truncations)
861
+
862
+ try:
863
+ dst_catalog.post(
864
+ f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
865
+ json=[row_to_insert]
866
+ )
867
+ table_rows += 1
868
+ except Exception as row_error:
869
+ error_msg_row = str(row_error)
870
+ if _BTREE_INDEX_ERROR_PATTERN in error_msg_row:
871
+ # Try truncating all columns if not already done
872
+ if truncate_oversized:
873
+ row_to_insert, truncations = truncate_row_values(row, None)
874
+ truncated_values.extend(truncations)
875
+ try:
876
+ dst_catalog.post(
877
+ f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
878
+ json=[row_to_insert]
879
+ )
880
+ table_rows += 1
881
+ continue
882
+ except Exception:
883
+ pass # Fall through to skip
884
+
885
+ rows_skipped += 1
886
+ rid = row.get('RID', 'unknown')
887
+ logger.debug(f"Skipping row {rid} due to index size limit")
888
+ else:
889
+ rows_skipped += 1
890
+ logger.debug(f"Skipping row: {row_error}")
891
+ last = page[-1]['RID']
892
+ else:
893
+ logger.warning(f"Failed to write to {sname}:{tname}: {e}")
894
+ return -1, rows_skipped, truncated_values
895
+
896
+ # Report skipped rows
897
+ if rows_skipped > 0:
898
+ report.add_issue(CloneIssue(
899
+ severity=CloneIssueSeverity.WARNING,
900
+ category=CloneIssueCategory.DATA_INTEGRITY,
901
+ message=f"Skipped {rows_skipped} rows due to index size limits",
902
+ table=table_key,
903
+ details=f"Index '{problematic_index}' on column '{problematic_column}'",
904
+ action="These rows have values too large for btree index (>2704 bytes)",
905
+ row_count=rows_skipped,
906
+ ))
907
+ logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits")
908
+
909
+ # Report truncated values
910
+ if truncated_values:
911
+ report.add_issue(CloneIssue(
912
+ severity=CloneIssueSeverity.INFO,
913
+ category=CloneIssueCategory.DATA_INTEGRITY,
914
+ message=f"Truncated {len(truncated_values)} values to fit index size limits",
915
+ table=table_key,
916
+ details=f"Values in column '{problematic_column}' were truncated to <{MAX_INDEX_VALUE_BYTES} bytes",
917
+ action="Original data was preserved with '[TRUNCATED]' suffix",
918
+ row_count=len(truncated_values),
919
+ ))
920
+ logger.info(f"Truncated {len(truncated_values)} values in {table_key}")
921
+
922
+ return table_rows, rows_skipped, truncated_values
923
+
924
+
925
+
926
+
927
+ def _rebuild_deferred_indexes(
928
+ dst_catalog: ErmrestCatalog,
929
+ deferred_indexes: dict[str, list[dict]],
930
+ report: "CloneReport",
931
+ ) -> None:
932
+ """Note any indexes that had issues during data copy.
933
+
934
+ This function is called after data copy to report on any index-related
935
+ issues that were encountered. Since ERMrest doesn't provide direct index
936
+ management, we can only report these issues for manual follow-up.
937
+
938
+ Args:
939
+ dst_catalog: Destination catalog.
940
+ deferred_indexes: Dict of table -> list of index definitions with issues.
941
+ report: Clone report.
942
+ """
943
+ if not deferred_indexes:
944
+ return
945
+
946
+ logger.info(f"Reporting {sum(len(v) for v in deferred_indexes.values())} index issues...")
947
+
948
+
949
+ def clone_catalog(
+     source_hostname: str,
+     source_catalog_id: str,
+     dest_hostname: str | None = None,
+     alias: str | None = None,
+     add_ml_schema: bool = False,
+     schema_only: bool = False,
+     asset_mode: AssetCopyMode = AssetCopyMode.REFERENCES,
+     asset_filter: AssetFilter | None = None,
+     copy_annotations: bool = True,
+     copy_policy: bool = True,
+     exclude_schemas: list[str] | None = None,
+     exclude_objects: list[str] | None = None,
+     source_credential: dict | None = None,
+     dest_credential: dict | None = None,
+     reinitialize_dataset_versions: bool = True,
+     orphan_strategy: OrphanStrategy = OrphanStrategy.FAIL,
+     prune_hidden_fkeys: bool = False,
+     truncate_oversized: bool = False,
+ ) -> CloneCatalogResult:
+     """Clone a catalog with robust handling of policy-induced FK violations.
+
+     This function handles the common case where source catalog policies are
+     incoherent - some domain tables have row-level policies hiding data, but
+     referring tables don't have matching policies, leading to visible references
+     to invisible rows.
+
+     Uses a three-stage approach:
+     1. Create schema WITHOUT foreign keys
+     2. Copy all accessible data
+     3. Apply foreign keys, handling violations based on orphan_strategy
+
+     Args:
+         source_hostname: Hostname of the source catalog server.
+         source_catalog_id: ID of the catalog to clone.
+         dest_hostname: Destination hostname. If None, clones to same server.
+         alias: Optional alias name for the new catalog.
+         add_ml_schema: If True, add the DerivaML schema to the clone.
+         schema_only: If True, copy only schema structure without data.
+         asset_mode: How to handle assets during cloning.
+         asset_filter: Optional filter to selectively copy assets.
+         copy_annotations: If True (default), copy all catalog annotations.
+         copy_policy: If True (default), copy ACL policies (requires ownership).
+         exclude_schemas: List of schema names to exclude from cloning.
+         exclude_objects: List of specific tables to exclude ("schema:table").
+         source_credential: Optional credential dict for source server.
+         dest_credential: Optional credential dict for destination server.
+         reinitialize_dataset_versions: If True, reset dataset versions for clone.
+         orphan_strategy: How to handle rows with dangling FK references:
+             - FAIL: Abort if FK violations occur (default)
+             - DELETE: Delete orphan rows
+             - NULLIFY: Set dangling FK values to NULL
+         prune_hidden_fkeys: If True, skip FKs where referenced columns have
+             "select": null rights (indicating potentially hidden data). This
+             prevents FK violations but degrades schema structure.
+         truncate_oversized: If True, automatically truncate text values that
+             exceed PostgreSQL's btree index size limit (2704 bytes). Truncated
+             values will have "...[TRUNCATED]" appended. If False (default),
+             rows with oversized values are skipped. All truncations are recorded
+             in the result's truncated_values list.
+
+     Returns:
+         CloneCatalogResult with details of the cloned catalog, including:
+         - truncated_values: List of TruncatedValue records for any values
+           that were truncated due to index size limits.
+         - rows_skipped: Count of rows skipped due to index size limits
+           (when truncate_oversized=False).
+
+     Raises:
+         ValueError: If invalid parameters or FK violations with FAIL strategy.
+
+     Example:
+         >>> # Clone with orphan deletion
+         >>> result = clone_catalog(
+         ...     "source.org", "21",
+         ...     dest_hostname="localhost",
+         ...     orphan_strategy=OrphanStrategy.DELETE,
+         ... )
+
+         >>> # Conservative clone that prunes problematic FKs
+         >>> result = clone_catalog(
+         ...     "source.org", "21",
+         ...     dest_hostname="localhost",
+         ...     prune_hidden_fkeys=True,
+         ... )
+     """
+ # Determine destination
1036
+ is_same_server = dest_hostname is None or dest_hostname == source_hostname
1037
+ effective_dest_hostname = source_hostname if dest_hostname is None else dest_hostname
1038
+
1039
+ # Get source snapshot for provenance
1040
+ source_snapshot = _get_catalog_snapshot(
1041
+ source_hostname, source_catalog_id, source_credential
1042
+ )
1043
+
1044
+ # Connect to source
1045
+ src_cred = source_credential or get_credential(source_hostname)
1046
+ src_server = DerivaServer("https", source_hostname, credentials=src_cred)
1047
+ src_catalog = src_server.connect_ermrest(source_catalog_id)
1048
+
1049
+ # Capture source schema for provenance before any modifications
1050
+ source_schema_json = src_catalog.get("/schema").json()
1051
+
1052
+ # Connect to destination and create new catalog
1053
+ if is_same_server:
1054
+ dst_cred = src_cred
1055
+ dst_server = src_server
1056
+ else:
1057
+ dst_cred = dest_credential or get_credential(effective_dest_hostname)
1058
+ dst_server = DerivaServer("https", effective_dest_hostname, credentials=dst_cred)
1059
+
1060
+ dst_catalog = dst_server.create_ermrest_catalog(
1061
+ name=f"Clone of {source_catalog_id}",
1062
+ description=f"Cloned from {source_hostname}:{source_catalog_id}",
1063
+ )
1064
+
1065
+ report = CloneReport()
1066
+
1067
+ # Track truncated values
1068
+ truncated_values: list[TruncatedValue] = []
1069
+ rows_skipped = 0
1070
+
1071
+ # Record clone timestamp
1072
+ clone_timestamp = datetime.now(timezone.utc).isoformat()
1073
+
1074
+ # Perform the three-stage clone
1075
+ orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values = _clone_three_stage(
1076
+ src_catalog=src_catalog,
1077
+ dst_catalog=dst_catalog,
1078
+ copy_data=not schema_only,
1079
+ copy_annotations=copy_annotations,
1080
+ copy_policy=copy_policy,
1081
+ exclude_schemas=exclude_schemas or [],
1082
+ exclude_objects=exclude_objects or [],
1083
+ orphan_strategy=orphan_strategy,
1084
+ prune_hidden_fkeys=prune_hidden_fkeys,
1085
+ truncate_oversized=truncate_oversized,
1086
+ report=report,
1087
+ )
1088
+
1089
+ result = CloneCatalogResult(
1090
+ catalog_id=str(dst_catalog.catalog_id),
1091
+ hostname=effective_dest_hostname,
1092
+ schema_only=schema_only,
1093
+ asset_mode=asset_mode,
1094
+ source_hostname=source_hostname,
1095
+ source_catalog_id=source_catalog_id,
1096
+ source_snapshot=source_snapshot,
1097
+ orphan_rows_removed=orphan_rows_removed,
1098
+ orphan_rows_nullified=orphan_rows_nullified,
1099
+ fkeys_pruned=fkeys_pruned,
1100
+ rows_skipped=rows_skipped,
1101
+ truncated_values=truncated_values,
1102
+ report=report,
1103
+ )
1104
+
1105
+ # Upload source schema to Hatrac and set catalog provenance
1106
+ source_schema_url = _upload_source_schema(
1107
+ hostname=effective_dest_hostname,
1108
+ catalog_id=result.catalog_id,
1109
+ schema_json=source_schema_json,
1110
+ credential=dst_cred,
1111
+ )
1112
+
1113
+ # Calculate total rows copied from report
1114
+ total_rows_copied = sum(report.tables_restored.values())
1115
+
1116
+ # Try to get current user identity
1117
+ created_by = None
1118
+ try:
1119
+ session_info = dst_catalog.get("/authn/session").json()
1120
+ if session_info and "client" in session_info:
1121
+ client = session_info["client"]
1122
+ created_by = client.get("display_name") or client.get("id")
1123
+ except Exception:
1124
+ pass
1125
+
1126
+ # Create clone details
1127
+ clone_details = CloneDetails(
1128
+ source_hostname=source_hostname,
1129
+ source_catalog_id=source_catalog_id,
1130
+ source_snapshot=source_snapshot,
1131
+ source_schema_url=source_schema_url,
1132
+ orphan_strategy=orphan_strategy.value,
1133
+ truncate_oversized=truncate_oversized,
1134
+ prune_hidden_fkeys=prune_hidden_fkeys,
1135
+ schema_only=schema_only,
1136
+ asset_mode=asset_mode.value,
1137
+ exclude_schemas=exclude_schemas or [],
1138
+ exclude_objects=exclude_objects or [],
1139
+ rows_copied=total_rows_copied,
1140
+ rows_skipped=rows_skipped,
1141
+ truncated_count=len(truncated_values),
1142
+ orphan_rows_removed=orphan_rows_removed,
1143
+ orphan_rows_nullified=orphan_rows_nullified,
1144
+ fkeys_pruned=fkeys_pruned,
1145
+ )
1146
+
1147
+ # Create and set catalog provenance annotation
1148
+ provenance = CatalogProvenance(
1149
+ creation_method=CatalogCreationMethod.CLONE,
1150
+ created_at=clone_timestamp,
1151
+ hostname=effective_dest_hostname,
1152
+ catalog_id=result.catalog_id,
1153
+ created_by=created_by,
1154
+ name=alias or f"Clone of {source_catalog_id}",
1155
+ description=f"Cloned from {source_hostname}:{source_catalog_id}",
1156
+ clone_details=clone_details,
1157
+ )
1158
+ _set_catalog_provenance(dst_catalog, provenance)
1159
+
1160
+ # Post-clone operations
1161
+ result = _post_clone_operations(
1162
+ result=result,
1163
+ alias=alias,
1164
+ add_ml_schema=add_ml_schema,
1165
+ credential=dst_cred,
1166
+ )
1167
+
1168
+ if reinitialize_dataset_versions and not schema_only:
1169
+ result = _reinitialize_dataset_versions(
1170
+ result=result,
1171
+ credential=dst_cred,
1172
+ )
1173
+
1174
+ return result
1175
+
1176
+
1177
+ def _clone_three_stage(
1178
+ src_catalog: ErmrestCatalog,
1179
+ dst_catalog: ErmrestCatalog,
1180
+ copy_data: bool,
1181
+ copy_annotations: bool,
1182
+ copy_policy: bool,
1183
+ exclude_schemas: list[str],
1184
+ exclude_objects: list[str],
1185
+ orphan_strategy: OrphanStrategy,
1186
+ prune_hidden_fkeys: bool,
1187
+ truncate_oversized: bool,
1188
+ report: CloneReport,
1189
+ ) -> tuple[int, int, int, int, list[TruncatedValue]]:
1190
+ """Perform three-stage catalog cloning.
1191
+
1192
+ Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values)
1193
+ """
1194
+ src_model = src_catalog.getCatalogModel()
1195
+
1196
+ # Parse exclude_objects
1197
+ excluded_tables: set[tuple[str, str]] = set()
1198
+ for obj in exclude_objects:
1199
+ if ":" in obj:
1200
+ schema, table = obj.split(":", 1)
1201
+ excluded_tables.add((schema, table))
1202
+
1203
+ # Set top-level config
1204
+ if copy_policy and src_model.acls:
1205
+ try:
1206
+ dst_catalog.put('/acl', json=src_model.acls)
1207
+ except Exception as e:
1208
+ logger.warning(f"Could not copy ACLs (may not be owner): {e}")
1209
+
1210
+ if copy_annotations:
1211
+ dst_catalog.put('/annotation', json=src_model.annotations)
1212
+
1213
+ # Build model content
1214
+ new_model = []
1215
+ clone_states = {}
1216
+ fkeys_deferred = []
1217
+ fkeys_pruned = 0
1218
+
1219
+ def prune_parts(d, *extra_victims):
1220
+ victims = set(extra_victims)
1221
+ if not copy_annotations:
1222
+ victims |= {'annotations'}
1223
+ if not copy_policy:
1224
+ victims |= {'acls', 'acl_bindings'}
1225
+ for k in victims:
1226
+ d.pop(k, None)
1227
+ return d
1228
+
1229
+ def copy_sdef(s):
1230
+ d = prune_parts(s.prejson(), 'tables')
1231
+ return d
1232
+
1233
+ def copy_tdef_core(t):
1234
+ d = prune_parts(t.prejson(), 'foreign_keys')
1235
+ d['column_definitions'] = [prune_parts(c) for c in d['column_definitions']]
1236
+ d['keys'] = [prune_parts(k) for k in d.get('keys', [])]
1237
+ d.setdefault('annotations', {})[_clone_state_url] = 1 if copy_data else None
1238
+ return d
1239
+
1240
+ def should_prune_fkey(fkdef, src_table):
1241
+ """Check if FK should be pruned due to hidden data."""
1242
+ if not prune_hidden_fkeys:
1243
+ return False
1244
+
1245
+ # Check if referenced columns have "select": null
1246
+ for ref_col in fkdef.get('referenced_columns', []):
1247
+ ref_schema = ref_col.get('schema_name')
1248
+ ref_table = ref_col.get('table_name')
1249
+ ref_col_name = ref_col.get('column_name')
1250
+
1251
+ if ref_schema and ref_table and ref_col_name:
1252
+ try:
1253
+ ref_table_obj = src_model.schemas[ref_schema].tables[ref_table]
1254
+ col_obj = ref_table_obj.column_definitions[ref_col_name]
1255
+ # Check column rights
1256
+ rights = getattr(col_obj, 'rights', None)
1257
+ if rights and rights.get('select') is None:
1258
+ return True
1259
+ except (KeyError, AttributeError):
1260
+ pass
1261
+ return False
1262
+
1263
+ def copy_tdef_fkeys(t, sname, tname):
1264
+ """Extract FKs, optionally pruning those with hidden references."""
1265
+ nonlocal fkeys_pruned
1266
+ fkeys = []
1267
+ for fkdef in t.prejson().get('foreign_keys', []):
1268
+ # Skip FKs to system tables
1269
+ skip = False
1270
+ for ref_col in fkdef.get('referenced_columns', []):
1271
+ if ref_col.get('schema_name') == 'public' \
1272
+ and ref_col.get('table_name') in {'ERMrest_Client', 'ERMrest_Group', 'ERMrest_RID_Lease'}:
1273
+ skip = True
1274
+ break
1275
+
1276
+ if skip:
1277
+ continue
1278
+
1279
+ if should_prune_fkey(fkdef, t):
1280
+ fkeys_pruned += 1
1281
+ fk_name = fkdef.get('names', [[sname, 'unknown']])[0]
1282
+ report.add_issue(CloneIssue(
1283
+ severity=CloneIssueSeverity.WARNING,
1284
+ category=CloneIssueCategory.FK_PRUNED,
1285
+ message=f"FK pruned due to hidden reference data",
1286
+ table=f"{sname}:{tname}",
1287
+ details=f"FK {fk_name} references columns with 'select': null",
1288
+ action="Source catalog may have incoherent policies",
1289
+ ))
1290
+ continue
1291
+
1292
+ fkeys.append(prune_parts(fkdef.copy()))
1293
+ return fkeys
1294
+
1295
+ # Collect schemas and tables
1296
+ for sname, schema in src_model.schemas.items():
1297
+ if sname in exclude_schemas:
1298
+ continue
1299
+
1300
+ new_model.append(copy_sdef(schema))
1301
+
1302
+ for tname, table in schema.tables.items():
1303
+ if (sname, tname) in excluded_tables:
1304
+ report.tables_skipped.append(f"{sname}:{tname}")
1305
+ continue
1306
+
1307
+ if table.kind != 'table':
1308
+ continue
1309
+
1310
+ if 'RID' not in table.column_definitions.elements:
1311
+ logger.warning(f"Table {sname}.{tname} lacks system columns, skipping")
1312
+ report.tables_skipped.append(f"{sname}:{tname}")
1313
+ continue
1314
+
1315
+ new_model.append(copy_tdef_core(table))
1316
+ clone_states[(sname, tname)] = 1 if copy_data else None
1317
+
1318
+ # Collect FKs for deferred application
1319
+ table_fkeys = copy_tdef_fkeys(table, sname, tname)
1320
+ for fk in table_fkeys:
1321
+ fkeys_deferred.append((sname, tname, fk))
1322
+
1323
+ # Stage 1: Apply schema without FKs
1324
+ logger.info("Stage 1: Creating schema without foreign keys...")
1325
+ if new_model:
1326
+ dst_catalog.post("/schema", json=new_model)
1327
+
1328
+ # Stage 2: Copy data
1329
+ total_rows = 0
1330
+ total_rows_skipped = 0
1331
+ all_truncated_values: list[TruncatedValue] = []
1332
+ deferred_indexes: dict[str, list[dict]] = {} # Track indexes dropped for later rebuild
1333
+
1334
+ if copy_data:
1335
+ logger.info("Stage 2: Copying data...")
1336
+ page_size = 10000
1337
+
1338
+ for (sname, tname), state in clone_states.items():
1339
+ if state != 1:
1340
+ continue
1341
+
1342
+ table_key = f"{sname}:{tname}"
1343
+ logger.debug(f"Copying data for {table_key}")
1344
+
1345
+ # Use the new copy function with index error handling
1346
+ table_rows, rows_skipped, truncated = _copy_table_data_with_retry(
1347
+ src_catalog=src_catalog,
1348
+ dst_catalog=dst_catalog,
1349
+ sname=sname,
1350
+ tname=tname,
1351
+ page_size=page_size,
1352
+ report=report,
1353
+ deferred_indexes=deferred_indexes,
1354
+ truncate_oversized=truncate_oversized,
1355
+ )
1356
+
1357
+ total_rows_skipped += rows_skipped
1358
+ all_truncated_values.extend(truncated)
1359
+
1360
+ if table_rows < 0:
1361
+ # Copy failed
1362
+             report.tables_failed.append(table_key)
+         else:
+             report.tables_restored[table_key] = table_rows
+             total_rows += table_rows
+
+         # Mark complete
+         try:
+             dst_catalog.put(
+                 f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                 json=2
+             )
+         except Exception:
+             pass
+
+     logger.info(f"Stage 2 complete: {total_rows} rows copied")
+
+     # Rebuild any indexes that were dropped during data copy
+     if deferred_indexes:
+         _rebuild_deferred_indexes(dst_catalog, deferred_indexes, report)
+
+     # Stage 3: Apply foreign keys
+     logger.info("Stage 3: Applying foreign keys...")
+     orphan_rows_removed = 0
+     orphan_rows_nullified = 0
+
+     if orphan_strategy == OrphanStrategy.DELETE:
+         # For DELETE strategy, we use a three-phase approach:
+         # Phase 1: Identify all FK violations without applying FKs yet
+         # Phase 2: Delete orphan rows in dependency order (leaf tables first)
+         # Phase 3: Apply all FKs
+         # This ensures deletions aren't blocked by already-applied FKs.
+
+         # Phase 1: Identify orphan values for each FK
+         logger.info("Phase 1: Identifying orphan values...")
+         fk_orphans: list[tuple[str, str, dict, set]] = []  # (sname, tname, fk, orphan_values)
+
+         for sname, tname, fk in fkeys_deferred:
+             orphan_values = _identify_orphan_values(dst_catalog, sname, tname, fk)
+             if orphan_values:
+                 fk_orphans.append((sname, tname, fk, orphan_values))
+                 logger.info(f"Found {len(orphan_values)} orphan values in {sname}:{tname}")
+
+         # Phase 2: Delete orphan rows in dependency order
+         # We need to delete from "leaf" tables first (tables that reference others
+         # but are not referenced themselves), then work our way up
+         if fk_orphans:
+             logger.info("Phase 2: Deleting orphan rows...")
+
+             # Build a map of which tables have orphans and which tables they reference
+             tables_with_orphans: set[tuple[str, str]] = set()
+             table_references: dict[tuple[str, str], set[tuple[str, str]]] = {}
+
+             for sname, tname, fk, orphan_values in fk_orphans:
+                 table_key = (sname, tname)
+                 tables_with_orphans.add(table_key)
+                 if table_key not in table_references:
+                     table_references[table_key] = set()
+                 for ref_col in fk.get('referenced_columns', []):
+                     ref_key = (ref_col.get('schema_name'), ref_col.get('table_name'))
+                     if ref_key[0] and ref_key[1]:
+                         table_references[table_key].add(ref_key)
+
+             # Also track which tables have FKs pointing TO them
+             referenced_by: dict[tuple[str, str], set[tuple[str, str]]] = {}
+             for sname, tname, fk in fkeys_deferred:
+                 for ref_col in fk.get('referenced_columns', []):
+                     ref_key = (ref_col.get('schema_name'), ref_col.get('table_name'))
+                     if ref_key[0] and ref_key[1]:
+                         if ref_key not in referenced_by:
+                             referenced_by[ref_key] = set()
+                         referenced_by[ref_key].add((sname, tname))
+
+             # Process deletions in waves with cascading orphan detection
+             # After each wave of deletions, we may have created new orphans in
+             # tables that referenced the deleted rows
+             max_waves = 20
+             all_processed_fks: set[tuple[str, str, str]] = set()  # (schema, table, fk_name)
+
+             for wave in range(max_waves):
+                 # Re-identify orphans for all FKs not yet fully processed
+                 current_orphans: list[tuple[str, str, dict, set]] = []
+
+                 for sname, tname, fk in fkeys_deferred:
+                     fk_names = fk.get('names', [])
+                     fk_id = (sname, tname, str(fk_names))
+                     if fk_id in all_processed_fks:
+                         continue
+
+                     orphan_values = _identify_orphan_values(dst_catalog, sname, tname, fk)
+                     if orphan_values:
+                         current_orphans.append((sname, tname, fk, orphan_values))
+
+                 if not current_orphans:
+                     logger.info(f"Deletion wave {wave + 1}: no more orphans found")
+                     break
+
+                 logger.info(f"Deletion wave {wave + 1}: processing {len(current_orphans)} FKs with orphans")
+
+                 # Delete orphans for each FK
+                 wave_deleted = 0
+                 for sname, tname, fk, orphan_values in current_orphans:
+                     removed, nullified = _delete_orphan_rows(
+                         dst_catalog, sname, tname, fk, orphan_values, report
+                     )
+                     orphan_rows_removed += removed
+                     orphan_rows_nullified += nullified
+                     wave_deleted += removed
+
+                     # Mark this FK as processed if we deleted all orphans
+                     if removed == len(orphan_values):
+                         fk_names = fk.get('names', [])
+                         fk_id = (sname, tname, str(fk_names))
+                         all_processed_fks.add(fk_id)
+
+                 if wave_deleted == 0:
+                     # No deletions in this wave - might be stuck
+                     logger.warning(f"Deletion wave {wave + 1}: no rows deleted, may have circular dependencies")
+                     break
+
+         # Phase 3: Apply all FKs
+         logger.info("Phase 3: Applying foreign keys...")
+         failed_fks = []
+
+         for sname, tname, fk in fkeys_deferred:
+             try:
+                 dst_catalog.post("/schema", json=[fk])
+                 report.fkeys_applied += 1
+             except Exception as e:
+                 failed_fks.append((sname, tname, fk, str(e)))
+
+         # Retry failed FKs with additional orphan cleanup
+         for retry_round in range(10):
+             if not failed_fks:
+                 break
+
+             logger.info(f"FK retry round {retry_round + 1}: {len(failed_fks)} FKs still failing")
+
+             # Try to clean up any remaining orphans for failed FKs
+             for sname, tname, fk, last_error in failed_fks:
+                 orphan_values = _identify_orphan_values(dst_catalog, sname, tname, fk)
+                 if orphan_values:
+                     removed, nullified = _delete_orphan_rows(
+                         dst_catalog, sname, tname, fk, orphan_values, report
+                     )
+                     orphan_rows_removed += removed
+                     orphan_rows_nullified += nullified
+
+             # Try to apply the failed FKs
+             still_failed = []
+             for sname, tname, fk, last_error in failed_fks:
+                 try:
+                     dst_catalog.post("/schema", json=[fk])
+                     report.fkeys_applied += 1
+                 except Exception as e:
+                     still_failed.append((sname, tname, fk, str(e)))
+
+             if len(still_failed) == len(failed_fks):
+                 # No progress - stop retrying
+                 logger.warning(f"FK retry round {retry_round + 1}: no progress, stopping retries")
+                 break
+
+             failed_fks = still_failed
+
+         # Record final failures
+         for sname, tname, fk, error_msg in failed_fks:
+             report.fkeys_failed += 1
+             report.add_issue(CloneIssue(
+                 severity=CloneIssueSeverity.ERROR,
+                 category=CloneIssueCategory.FK_VIOLATION,
+                 message="FK still failing after handling orphans",
+                 table=f"{sname}:{tname}",
+                 details=error_msg[:500],
+             ))
+
+     else:
+         # For NULLIFY or FAIL strategies, use the simpler single-pass approach
+         for sname, tname, fk in fkeys_deferred:
+             try:
+                 dst_catalog.post("/schema", json=[fk])
+                 report.fkeys_applied += 1
+             except Exception as e:
+                 error_msg = str(e)
+
+                 if orphan_strategy == OrphanStrategy.FAIL:
+                     report.fkeys_failed += 1
+                     report.add_issue(CloneIssue(
+                         severity=CloneIssueSeverity.ERROR,
+                         category=CloneIssueCategory.FK_VIOLATION,
+                         message="FK constraint failed",
+                         table=f"{sname}:{tname}",
+                         details=error_msg[:500],
+                         action="Use orphan_strategy=DELETE or NULLIFY to handle",
+                     ))
+                     continue
+
+                 # NULLIFY strategy
+                 removed, nullified = _handle_fk_violation(
+                     dst_catalog, sname, tname, fk, orphan_strategy, report
+                 )
+                 orphan_rows_removed += removed
+                 orphan_rows_nullified += nullified
+
+                 # Retry FK application
+                 try:
+                     dst_catalog.post("/schema", json=[fk])
+                     report.fkeys_applied += 1
+                 except Exception as retry_error:
+                     report.fkeys_failed += 1
+                     report.add_issue(CloneIssue(
+                         severity=CloneIssueSeverity.ERROR,
+                         category=CloneIssueCategory.FK_VIOLATION,
+                         message="FK still failing after nullifying orphans",
+                         table=f"{sname}:{tname}",
+                         details=str(retry_error)[:500],
+                     ))
+
+     report.fkeys_pruned = fkeys_pruned
+
+     # Stage 3b: Copy configuration
+     if copy_annotations or copy_policy:
+         _copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)
+
+     return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_truncated_values
+
+
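Editor's note: the DELETE strategy above re-scans for orphans after every wave because removing a referenced row can strand rows in the tables that point at it. A minimal, self-contained sketch of that cascade follows; it uses plain dicts as toy tables and made-up table/column names, not the DerivaML or ERMrest API.

# Editorial sketch (not package code): wave-based orphan cleanup on toy tables.
def delete_orphans_in_waves(tables, fks, max_waves=20):
    """tables: {name: {pk: row}}; fks: [(child_table, fk_column, parent_table)]."""
    for wave in range(max_waves):
        removed = 0
        for child, fk_col, parent in fks:
            parent_keys = set(tables[parent])
            orphans = [pk for pk, row in tables[child].items()
                       if row[fk_col] is not None and row[fk_col] not in parent_keys]
            for pk in orphans:
                del tables[child][pk]
                removed += 1
        if removed == 0:   # a wave with no deletions means we have converged
            break
    return tables

tables = {
    "Dataset": {1: {}},
    "Execution": {10: {"dataset": 1}, 11: {"dataset": 2}},      # 11 is orphaned
    "Asset": {100: {"execution": 11}, 101: {"execution": 10}},  # 100 becomes orphaned once 11 goes
}
fks = [("Execution", "dataset", "Dataset"), ("Asset", "execution", "Execution")]
delete_orphans_in_waves(tables, fks)  # leaves Execution 10 and Asset 101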
1587
+ def _identify_orphan_values(
+     dst_catalog: ErmrestCatalog,
+     sname: str,
+     tname: str,
+     fk_def: dict,
+ ) -> set:
+     """Identify orphan FK values without deleting them.
+
+     Returns: Set of values that exist in the FK column but not in the referenced table.
+     """
+     fk_columns = fk_def.get('foreign_key_columns', [])
+     ref_columns = fk_def.get('referenced_columns', [])
+
+     if not fk_columns or not ref_columns:
+         return set()
+
+     src_col = fk_columns[0].get('column_name')
+     ref_schema = ref_columns[0].get('schema_name')
+     ref_table = ref_columns[0].get('table_name')
+     ref_col = ref_columns[0].get('column_name')
+
+     if not all([src_col, ref_schema, ref_table, ref_col]):
+         return set()
+
+     src_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+     ref_uri = f"{urlquote(ref_schema)}:{urlquote(ref_table)}"
+
+     try:
+         src_values = dst_catalog.get(
+             f"/attributegroup/{src_uri}/{urlquote(src_col)}"
+         ).json()
+         src_value_set = {row[src_col] for row in src_values if row.get(src_col) is not None}
+     except Exception as e:
+         logger.error(f"Failed to get source values for {sname}:{tname}.{src_col}: {e}")
+         return set()
+
+     try:
+         ref_values = dst_catalog.get(
+             f"/attributegroup/{ref_uri}/{urlquote(ref_col)}"
+         ).json()
+         ref_value_set = {row[ref_col] for row in ref_values if row.get(ref_col) is not None}
+     except Exception as e:
+         logger.error(f"Failed to get reference values for {ref_schema}:{ref_table}.{ref_col}: {e}")
+         return set()
+
+     return src_value_set - ref_value_set
+
+
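Editor's note: the orphan test reduces to a set difference over the two single-column projections returned by the `/attributegroup` requests. A small sketch with made-up RID values, mirroring the JSON shape those requests return:

# Editorial sketch (not package code): the set-difference at the heart of _identify_orphan_values.
src_rows = [{"Dataset": "1-ABC"}, {"Dataset": "1-DEF"}, {"Dataset": None}]  # FK column in the child table
ref_rows = [{"RID": "1-ABC"}]                                               # key column in the parent table

src_values = {r["Dataset"] for r in src_rows if r.get("Dataset") is not None}
ref_values = {r["RID"] for r in ref_rows if r.get("RID") is not None}
orphans = src_values - ref_values  # {'1-DEF'}: present in the FK column, missing from the parent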
1635
+ def _delete_orphan_rows(
+     dst_catalog: ErmrestCatalog,
+     sname: str,
+     tname: str,
+     fk_def: dict,
+     orphan_values: set,
+     report: CloneReport,
+ ) -> tuple[int, int]:
+     """Delete rows with orphan FK values.
+
+     Returns: (rows_removed, rows_nullified)
+     """
+     fk_columns = fk_def.get('foreign_key_columns', [])
+     ref_columns = fk_def.get('referenced_columns', [])
+
+     if not fk_columns or not ref_columns:
+         return 0, 0
+
+     src_col = fk_columns[0].get('column_name')
+     ref_schema = ref_columns[0].get('schema_name')
+     ref_table = ref_columns[0].get('table_name')
+
+     src_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+
+     rows_removed = 0
+     for value in orphan_values:
+         encoded_value = urlquote(str(value), safe='') if isinstance(value, str) else str(value)
+         try:
+             dst_catalog.delete(f"/entity/{src_uri}/{urlquote(src_col)}={encoded_value}")
+             rows_removed += 1
+         except Exception as e:
+             # Log but don't fail - the row might have been deleted by a previous operation
+             # or might be blocked by another FK that will be handled later
+             logger.debug(f"Could not delete {sname}:{tname} where {src_col}={value}: {e}")
+
+     # Record in report
+     if rows_removed > 0:
+         table_key = f"{sname}:{tname}"
+         if table_key not in report.orphan_details:
+             report.orphan_details[table_key] = {
+                 "rows_removed": 0,
+                 "rows_nullified": 0,
+                 "missing_references": {},
+             }
+
+         report.orphan_details[table_key]["rows_removed"] += rows_removed
+         ref_key = f"{ref_schema}:{ref_table}"
+         report.orphan_details[table_key]["missing_references"][ref_key] = len(orphan_values)
+
+         report.add_issue(CloneIssue(
+             severity=CloneIssueSeverity.WARNING,
+             category=CloneIssueCategory.ORPHAN_ROWS,
+             message="Orphan rows deleted",
+             table=table_key,
+             details=f"Missing references to: {ref_key} ({len(orphan_values)})",
+             action="Source catalog may have incoherent row-level policies",
+             row_count=rows_removed,
+         ))
+
+     return rows_removed, 0
+
+
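Editor's note: `_delete_orphan_rows` issues one DELETE per orphan value. If the deployment's ERMrest filter language supports disjunction with `;` (an assumption about the server, not something this module relies on), the deletes could be batched roughly like the sketch below; the helper name and batch size are illustrative only.

# Editorial sketch (not package code): hypothetical batched deletion of orphan values.
def batched_delete(dst_catalog, src_uri, src_col, orphan_values, batch_size=50):
    values = sorted(str(v) for v in orphan_values)
    for i in range(0, len(values), batch_size):
        batch = values[i:i + batch_size]
        # "col=a;col=b" is the assumed ERMrest disjunction syntax for "a OR b"
        predicate = ";".join(f"{urlquote(src_col)}={urlquote(v, safe='')}" for v in batch)
        dst_catalog.delete(f"/entity/{src_uri}/{predicate}")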
1697
+ def _handle_fk_violation(
+     dst_catalog: ErmrestCatalog,
+     sname: str,
+     tname: str,
+     fk_def: dict,
+     strategy: OrphanStrategy,
+     report: CloneReport,
+ ) -> tuple[int, int]:
+     """Handle FK violation by deleting or nullifying orphan rows.
+
+     Returns: (rows_removed, rows_nullified)
+     """
+     fk_columns = fk_def.get('foreign_key_columns', [])
+     ref_columns = fk_def.get('referenced_columns', [])
+
+     if not fk_columns or not ref_columns:
+         return 0, 0
+
+     src_col = fk_columns[0].get('column_name')
+     ref_schema = ref_columns[0].get('schema_name')
+     ref_table = ref_columns[0].get('table_name')
+     ref_col = ref_columns[0].get('column_name')
+
+     if not all([src_col, ref_schema, ref_table, ref_col]):
+         return 0, 0
+
+     src_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+     ref_uri = f"{urlquote(ref_schema)}:{urlquote(ref_table)}"
+
+     # Find orphan values
+     try:
+         src_values = dst_catalog.get(
+             f"/attributegroup/{src_uri}/{urlquote(src_col)}"
+         ).json()
+         src_value_set = {row[src_col] for row in src_values if row.get(src_col) is not None}
+     except Exception as e:
+         logger.error(f"Failed to get source values: {e}")
+         return 0, 0
+
+     try:
+         ref_values = dst_catalog.get(
+             f"/attributegroup/{ref_uri}/{urlquote(ref_col)}"
+         ).json()
+         ref_value_set = {row[ref_col] for row in ref_values if row.get(ref_col) is not None}
+     except Exception as e:
+         logger.error(f"Failed to get reference values: {e}")
+         return 0, 0
+
+     orphan_values = src_value_set - ref_value_set
+
+     if not orphan_values:
+         return 0, 0
+
+     logger.info(f"Found {len(orphan_values)} orphan values in {sname}:{tname}.{src_col}")
+
+     rows_removed = 0
+     rows_nullified = 0
+
+     for value in orphan_values:
+         encoded_value = urlquote(str(value), safe='') if isinstance(value, str) else str(value)
+
+         if strategy == OrphanStrategy.DELETE:
+             try:
+                 dst_catalog.delete(f"/entity/{src_uri}/{urlquote(src_col)}={encoded_value}")
+                 rows_removed += 1
+             except Exception as e:
+                 logger.warning(f"Failed to delete orphans for {src_col}={value}: {e}")
+         elif strategy == OrphanStrategy.NULLIFY:
+             try:
+                 # Set FK column to NULL for orphan rows
+                 dst_catalog.put(
+                     f"/attributegroup/{src_uri}/{urlquote(src_col)}={encoded_value}/{urlquote(src_col)}",
+                     json=None
+                 )
+                 rows_nullified += 1
+             except Exception as e:
+                 logger.warning(f"Failed to nullify {src_col}={value}: {e}")
+
+     # Record in report
+     table_key = f"{sname}:{tname}"
+     if table_key not in report.orphan_details:
+         report.orphan_details[table_key] = {
+             "rows_removed": 0,
+             "rows_nullified": 0,
+             "missing_references": {},
+         }
+
+     report.orphan_details[table_key]["rows_removed"] += rows_removed
+     report.orphan_details[table_key]["rows_nullified"] += rows_nullified
+     ref_key = f"{ref_schema}:{ref_table}"
+     report.orphan_details[table_key]["missing_references"][ref_key] = len(orphan_values)
+
+     action_taken = "deleted" if strategy == OrphanStrategy.DELETE else "nullified"
+     report.add_issue(CloneIssue(
+         severity=CloneIssueSeverity.WARNING,
+         category=CloneIssueCategory.ORPHAN_ROWS,
+         message=f"Orphan rows {action_taken}",
+         table=table_key,
+         details=f"Missing references to: {ref_key} ({len(orphan_values)})",
+         action="Source catalog may have incoherent row-level policies",
+         row_count=rows_removed + rows_nullified,
+     ))
+
+     return rows_removed, rows_nullified
+
+
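Editor's note: both strategies leave the table in a state where the deferred FK can finally be applied. With NULLIFY the offending FK value becomes NULL, which the constraint ignores; with DELETE the offending row is gone. A toy illustration with made-up rows, independent of the catalog API:

# Editorial sketch (not package code): effect of NULLIFY vs DELETE on a toy child table.
rows = [{"RID": "2-AAA", "Subject": "1-GONE"}, {"RID": "2-BBB", "Subject": "1-OK"}]
valid_subjects = {"1-OK"}  # keys actually present in the parent table

nullified = [dict(r, Subject=None) if r["Subject"] not in valid_subjects else r for r in rows]
deleted = [r for r in rows if r["Subject"] in valid_subjects]

# After either transformation every remaining FK value is valid or NULL,
# so the foreign key constraint can be applied without violations.
assert all(r["Subject"] in valid_subjects or r["Subject"] is None for r in nullified)
assert all(r["Subject"] in valid_subjects for r in deleted)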
1803
+ def _copy_configuration(
+     src_model,
+     dst_catalog: ErmrestCatalog,
+     copy_annotations: bool,
+     copy_policy: bool,
+     exclude_schemas: list[str],
+     excluded_tables: set[tuple[str, str]],
+ ) -> None:
+     """Copy annotations and policies after FK application."""
+     dst_model = dst_catalog.getCatalogModel()
+
+     for sname, src_schema in src_model.schemas.items():
+         if sname in exclude_schemas or sname not in dst_model.schemas:
+             continue
+
+         dst_schema = dst_model.schemas[sname]
+
+         if copy_annotations:
+             for k, v in src_schema.annotations.items():
+                 if k != _clone_state_url:
+                     dst_schema.annotations[k] = v
+
+         if copy_policy:
+             if hasattr(dst_schema, 'acls') and hasattr(src_schema, 'acls'):
+                 dst_schema.acls.update(src_schema.acls)
+             if hasattr(dst_schema, 'acl_bindings') and hasattr(src_schema, 'acl_bindings'):
+                 dst_schema.acl_bindings.update(src_schema.acl_bindings)
+
+         for tname, src_table in src_schema.tables.items():
+             if (sname, tname) in excluded_tables or tname not in dst_schema.tables:
+                 continue
+
+             dst_table = dst_schema.tables[tname]
+
+             if copy_annotations:
+                 for k, v in src_table.annotations.items():
+                     if k != _clone_state_url:
+                         dst_table.annotations[k] = v
+
+             if copy_policy:
+                 if hasattr(dst_table, 'acls') and hasattr(src_table, 'acls'):
+                     dst_table.acls.update(src_table.acls)
+                 if hasattr(dst_table, 'acl_bindings') and hasattr(src_table, 'acl_bindings'):
+                     dst_table.acl_bindings.update(src_table.acl_bindings)
+
+     try:
+         dst_model.apply()
+     except Exception as e:
+         logger.warning(f"Failed to apply some configuration: {e}")
+
+
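Editor's note: the annotation copy above is "everything except the clone bookkeeping key", applied per schema and per table, followed by a single `dst_model.apply()` to keep server round trips down. The same filter written as a reusable helper (a sketch assuming, as the function above does, that `.annotations` behaves like a plain dict on deriva-py model objects):

# Editorial sketch (not package code): annotation copy with a skip list.
def copy_annotations_except(src_obj, dst_obj, skip_keys=(_clone_state_url,)):
    for key, value in src_obj.annotations.items():
        if key not in skip_keys:
            dst_obj.annotations[key] = value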
1854
+ def _get_catalog_snapshot(
+     hostname: str,
+     catalog_id: str,
+     credential: dict | None,
+ ) -> str | None:
+     """Get the current snapshot ID for a catalog."""
+     try:
+         cred = credential or get_credential(hostname)
+         server = DerivaServer("https", hostname, credentials=cred)
+         catalog = server.connect_ermrest(catalog_id)
+         response = catalog.get("/")
+         if response.status_code == 200:
+             data = response.json()
+             return data.get("snaptime")
+     except Exception as e:
+         logger.warning(f"Could not get catalog snapshot: {e}")
+     return None
+
+
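Editor's note: one plausible use of the snapshot id returned above is to pin later reads to a fixed point in time using ERMrest's "catalog@snaptime" addressing. Treat the hostname, catalog id, and the snapshot-qualified `connect_ermrest` call below as assumptions to verify against your deployment, not guaranteed behavior of this module.

# Editorial sketch (not package code): pinning reads to a snapshot (hypothetical host and catalog id).
snaptime = _get_catalog_snapshot("demo.example.org", "1", credential=None)
if snaptime:
    cred = get_credential("demo.example.org")
    server = DerivaServer("https", "demo.example.org", credentials=cred)
    pinned = server.connect_ermrest(f"1@{snaptime}")  # assumed snapshot-qualified catalog id
    rows = pinned.get("/entity/deriva-ml:Dataset").json()  # reads see the frozen view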
1873
+ def _post_clone_operations(
+     result: CloneCatalogResult,
+     alias: str | None,
+     add_ml_schema: bool,
+     credential: dict | None,
+ ) -> CloneCatalogResult:
+     """Perform post-clone operations."""
+     cred = credential or get_credential(result.hostname)
+     server = DerivaServer("https", result.hostname, credentials=cred)
+
+     if alias:
+         try:
+             server.post(
+                 f"/ermrest/catalog/{result.catalog_id}/alias/{urlquote(alias)}",
+                 json={}
+             )
+             result.alias = alias
+         except Exception as e:
+             logger.warning(f"Failed to create alias '{alias}': {e}")
+
+     if add_ml_schema:
+         try:
+             from deriva_ml.schema import add_ml_schema as add_schema
+             catalog = server.connect_ermrest(result.catalog_id)
+             add_schema(catalog)
+             result.ml_schema_added = True
+         except Exception as e:
+             logger.warning(f"Failed to add ML schema: {e}")
+
+     return result
+
+
1905
+ def _reinitialize_dataset_versions(
+     result: CloneCatalogResult,
+     credential: dict | None,
+ ) -> CloneCatalogResult:
+     """Reinitialize dataset versions after cloning."""
+     try:
+         cred = credential or get_credential(result.hostname)
+         server = DerivaServer("https", result.hostname, credentials=cred)
+         catalog = server.connect_ermrest(result.catalog_id)
+
+         model = catalog.getCatalogModel()
+         if "deriva-ml" not in model.schemas:
+             return result
+
+         datasets = catalog.get("/entity/deriva-ml:Dataset").json()
+
+         for dataset in datasets:
+             # Resolve the RID before the try block so the failure log can always name the dataset
+             rid = dataset.get("RID")
+             try:
+                 catalog.post(
+                     "/entity/deriva-ml:Dataset_Version",
+                     json=[{
+                         "Dataset": rid,
+                         "Version": "0.0.1",
+                         "Description": f"Cloned from {result.source_hostname}:{result.source_catalog_id}",
+                     }]
+                 )
+                 result.datasets_reinitialized += 1
+             except Exception as e:
+                 logger.warning(f"Failed to reinitialize version for dataset {rid}: {e}")
+
+     except Exception as e:
+         logger.warning(f"Failed to reinitialize dataset versions: {e}")
+
+     return result
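
Editor's note: a sketch of how the helpers in this excerpt fit together once a clone exists. `clone_result` stands in for whatever `CloneCatalogResult` the module's public clone entry point returns (that call is outside this excerpt), and the alias name is illustrative.

# Editorial sketch (not package code): post-clone orchestration using the helpers above.
snap = _get_catalog_snapshot(clone_result.source_hostname,
                             clone_result.source_catalog_id, credential=None)
result = _post_clone_operations(clone_result, alias="my-clone",
                                add_ml_schema=True, credential=None)
result = _reinitialize_dataset_versions(result, credential=None)
logger.info(f"Clone ready: catalog {result.catalog_id}, "
            f"{result.datasets_reinitialized} dataset versions reinitialized")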