deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +186 -105
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +545 -244
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +224 -35
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.9.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/catalog/clone.py
@@ -0,0 +1,1199 @@
+"""Enhanced catalog cloning with cross-server and selective asset support.
+
+This module provides catalog cloning that handles the common case of incoherent
+row-level policies in source catalogs. When source policies hide some domain
+table rows but don't hide the referring rows, foreign key violations occur
+during cloning.
+
+The solution uses a three-stage approach:
+1. Create schema WITHOUT foreign keys
+2. Copy all data
+3. Apply foreign keys, handling violations by either:
+   - Deleting orphan rows (rows with dangling FK references)
+   - Nullifying references (setting dangling FK values to NULL)
+
+This approach is more robust than trying to pre-filter data, as it handles
+all edge cases including circular dependencies and complex FK relationships.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+from urllib.parse import quote as urlquote
+
+from deriva.core import DerivaServer, ErmrestCatalog, get_credential
+
+logger = logging.getLogger("deriva_ml")
+
+
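As a quick orientation before the class definitions, here is a minimal sketch of how the module's public entry point is meant to be driven (hostname and catalog ID are hypothetical placeholders; the import path follows this file's location in the wheel):

    from deriva_ml.catalog.clone import OrphanStrategy, clone_catalog

    # Clone catalog "21" from a source host onto localhost, deleting any rows
    # whose FK targets were hidden from us by source row-level policies.
    result = clone_catalog(
        "source.example.org", "21",
        dest_hostname="localhost",
        orphan_strategy=OrphanStrategy.DELETE,
    )
    print(result.catalog_id)        # ID of the newly created catalog
    print(result.report.to_text())  # human-readable clone report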
+class CloneIssueSeverity(Enum):
+    """Severity level of clone issues."""
+
+    INFO = "info"
+    WARNING = "warning"
+    ERROR = "error"
+    CRITICAL = "critical"
+
+
+class CloneIssueCategory(Enum):
+    """Category of clone issues."""
+
+    ACCESS_DENIED = "access_denied"
+    ORPHAN_ROWS = "orphan_rows"
+    DATA_INTEGRITY = "data_integrity"
+    SCHEMA_ISSUE = "schema_issue"
+    RESTORE_FAILURE = "restore_failure"
+    FK_VIOLATION = "fk_violation"
+    FK_PRUNED = "fk_pruned"  # FK was intentionally not applied
+    POLICY_INCOHERENCE = "policy_incoherence"
+
+
+class OrphanStrategy(Enum):
+    """Strategy for handling orphan rows (rows with dangling FK references).
+
+    When cloning a catalog with incoherent row-level policies, some rows may
+    reference parent rows that are hidden from the cloning user. These orphan
+    rows would violate FK constraints.
+    """
+
+    FAIL = "fail"        # Fail the clone if FK violations occur
+    DELETE = "delete"    # Delete rows with dangling references
+    NULLIFY = "nullify"  # Set dangling FK values to NULL (requires nullok)
+
+
+class AssetCopyMode(Enum):
+    """How to handle assets during catalog cloning."""
+
+    NONE = "none"        # Don't copy assets
+    REFERENCES = "refs"  # Keep URLs pointing to source
+    FULL = "full"        # Download and re-upload assets
+
+
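Because each member carries a string value, a config or CLI layer can map user input directly onto these enums via standard value lookup; a small illustrative sketch (the surrounding flag handling is hypothetical):

    # Enum lookup by value: OrphanStrategy("delete") is OrphanStrategy.DELETE
    strategy = OrphanStrategy("delete")
    assets = AssetCopyMode("refs")  # note: the value "refs" maps to REFERENCES
    assert strategy is OrphanStrategy.DELETE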
+@dataclass
+class CloneIssue:
+    """A single issue encountered during catalog cloning."""
+
+    severity: CloneIssueSeverity
+    category: CloneIssueCategory
+    message: str
+    table: str | None = None
+    details: str | None = None
+    action: str | None = None
+    row_count: int = 0
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "severity": self.severity.value,
+            "category": self.category.value,
+            "message": self.message,
+            "table": self.table,
+            "details": self.details,
+            "action": self.action,
+            "row_count": self.row_count,
+        }
+
+    def __str__(self) -> str:
+        parts = [f"[{self.severity.value.upper()}]"]
+        if self.table:
+            parts.append(f"{self.table}:")
+        parts.append(self.message)
+        if self.row_count > 0:
+            parts.append(f"({self.row_count} rows)")
+        return " ".join(parts)
+
+
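To see the rendered form produced by __str__, a tiny sketch using only the fields defined above (the table name is hypothetical):

    issue = CloneIssue(
        severity=CloneIssueSeverity.WARNING,
        category=CloneIssueCategory.ORPHAN_ROWS,
        message="Orphan rows deleted",
        table="demo:Image",
        row_count=3,
    )
    print(issue)  # [WARNING] demo:Image: Orphan rows deleted (3 rows)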
+@dataclass
+class CloneReport:
+    """Comprehensive report of catalog clone operation.
+
+    Tracks all issues encountered during cloning, including:
+    - Policy incoherence issues (FK violations due to hidden data)
+    - Orphan rows that were deleted or nullified
+    - FKs that were pruned or failed
+    - Tables that were restored, failed, or skipped
+
+    Provides both JSON and text output formats for reporting.
+    """
+
+    issues: list[CloneIssue] = field(default_factory=list)
+    tables_restored: dict[str, int] = field(default_factory=dict)
+    tables_failed: list[str] = field(default_factory=list)
+    tables_skipped: list[str] = field(default_factory=list)
+    orphan_details: dict[str, dict] = field(default_factory=dict)
+    fkeys_applied: int = 0
+    fkeys_failed: int = 0
+    fkeys_pruned: int = 0
+
+    def add_issue(self, issue: CloneIssue) -> None:
+        self.issues.append(issue)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the report as a JSON-serializable dictionary."""
+        return {
+            "summary": {
+                "total_issues": len(self.issues),
+                "errors": len([i for i in self.issues if i.severity == CloneIssueSeverity.ERROR]),
+                "warnings": len([i for i in self.issues if i.severity == CloneIssueSeverity.WARNING]),
+                "tables_restored": len(self.tables_restored),
+                "tables_failed": len(self.tables_failed),
+                "tables_skipped": len(self.tables_skipped),
+                "total_rows_restored": sum(self.tables_restored.values()),
+                "orphan_rows_removed": sum(
+                    d.get("rows_removed", 0) for d in self.orphan_details.values()
+                ),
+                "orphan_rows_nullified": sum(
+                    d.get("rows_nullified", 0) for d in self.orphan_details.values()
+                ),
+                "fkeys_applied": self.fkeys_applied,
+                "fkeys_failed": self.fkeys_failed,
+                "fkeys_pruned": self.fkeys_pruned,
+            },
+            "issues": [i.to_dict() for i in self.issues],
+            "tables_restored": self.tables_restored,
+            "tables_failed": self.tables_failed,
+            "tables_skipped": self.tables_skipped,
+            "orphan_details": self.orphan_details,
+        }
+
+    def to_json(self, indent: int = 2) -> str:
+        """Return the report as a formatted JSON string."""
+        import json
+        return json.dumps(self.to_dict(), indent=indent)
+
+    def to_text(self) -> str:
+        """Return the report as human-readable text."""
+        lines = []
+        lines.append("=" * 70)
+        lines.append("CATALOG CLONE REPORT")
+        lines.append("=" * 70)
+        lines.append("")
+
+        # Summary
+        summary = self.to_dict()["summary"]
+        lines.append("SUMMARY")
+        lines.append("-" * 40)
+        lines.append(f"Tables restored: {summary['tables_restored']}")
+        lines.append(f"Tables failed: {summary['tables_failed']}")
+        lines.append(f"Tables skipped: {summary['tables_skipped']}")
+        lines.append(f"Total rows restored: {summary['total_rows_restored']}")
+        lines.append(f"Orphan rows removed: {summary['orphan_rows_removed']}")
+        lines.append(f"Orphan rows nullified: {summary['orphan_rows_nullified']}")
+        lines.append(f"FKs applied: {summary['fkeys_applied']}")
+        lines.append(f"FKs failed: {summary['fkeys_failed']}")
+        lines.append(f"FKs pruned: {summary['fkeys_pruned']}")
+        lines.append(f"Errors: {summary['errors']}")
+        lines.append(f"Warnings: {summary['warnings']}")
+        lines.append("")
+
+        # Issues by severity
+        if self.issues:
+            lines.append("ISSUES")
+            lines.append("-" * 40)
+
+            # Group by severity
+            for severity in [CloneIssueSeverity.CRITICAL, CloneIssueSeverity.ERROR,
+                             CloneIssueSeverity.WARNING, CloneIssueSeverity.INFO]:
+                severity_issues = [i for i in self.issues if i.severity == severity]
+                if severity_issues:
+                    lines.append(f"\n{severity.value.upper()} ({len(severity_issues)}):")
+                    for issue in severity_issues:
+                        lines.append(f"  - {issue}")
+                        if issue.details:
+                            # Truncate long details
+                            details = issue.details[:100] + "..." if len(issue.details) > 100 else issue.details
+                            lines.append(f"    Details: {details}")
+                        if issue.action:
+                            lines.append(f"    Action: {issue.action}")
+            lines.append("")
+
+        # Orphan details
+        if self.orphan_details:
+            lines.append("ORPHAN ROW DETAILS")
+            lines.append("-" * 40)
+            for table, details in self.orphan_details.items():
+                removed = details.get("rows_removed", 0)
+                nullified = details.get("rows_nullified", 0)
+                lines.append(f"  {table}:")
+                if removed > 0:
+                    lines.append(f"    Rows deleted: {removed}")
+                if nullified > 0:
+                    lines.append(f"    Rows nullified: {nullified}")
+                missing = details.get("missing_references", {})
+                for ref_table, count in missing.items():
+                    lines.append(f"      -> missing references to {ref_table}: {count}")
+            lines.append("")
+
+        # Assessment
+        lines.append("CLONE ASSESSMENT")
+        lines.append("-" * 40)
+        if summary['errors'] > 0:
+            lines.append("Clone completed with ERRORS. Some FKs could not be applied.")
+            lines.append("The catalog schema may be degraded.")
+        elif summary['orphan_rows_removed'] > 0 or summary['orphan_rows_nullified'] > 0:
+            lines.append("Clone completed with orphan handling.")
+            lines.append("Source catalog may have incoherent row-level policies.")
+        elif summary['fkeys_pruned'] > 0:
+            lines.append("Clone completed with pruned FKs.")
+            lines.append("Some FK constraints were skipped due to hidden reference data.")
+        else:
+            lines.append("Clone completed successfully.")
+        lines.append("")
+        lines.append("=" * 70)
+
+        return "\n".join(lines)
+
+    def __str__(self) -> str:
+        """Return text representation of the report."""
+        return self.to_text()
+
+
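A minimal sketch of the reporting round trip, reusing the issue object from the previous sketch (the table name and row count are made up):

    report = CloneReport()
    report.add_issue(issue)
    report.tables_restored["demo:Image"] = 120
    print(report.to_json())  # machine-readable: summary block plus issue list
    print(report)            # __str__ delegates to the formatted to_text() view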
+@dataclass
+class AssetFilter:
+    """Filter for selecting which assets to copy during cloning."""
+
+    tables: list[str] | None = None
+    rids: list[str] | None = None
+
+
+@dataclass
+class CloneCatalogResult:
+    """Result of a catalog clone operation."""
+
+    catalog_id: str
+    hostname: str
+    schema_only: bool
+    asset_mode: AssetCopyMode
+    source_hostname: str
+    source_catalog_id: str
+    source_snapshot: str | None = None
+    alias: str | None = None
+    ml_schema_added: bool = False
+    datasets_reinitialized: int = 0
+    orphan_rows_removed: int = 0
+    orphan_rows_nullified: int = 0
+    fkeys_pruned: int = 0
+    report: CloneReport | None = None
+
+
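AssetFilter is a plain container consumed via clone_catalog's asset_filter parameter; a sketch of narrowing a full asset copy to one table and two records (table name and RIDs are hypothetical, and this only exercises the signature defined below):

    filt = AssetFilter(tables=["demo:Image"], rids=["1-ABC0", "1-ABC2"])
    result = clone_catalog(
        "source.example.org", "21",
        asset_mode=AssetCopyMode.FULL,
        asset_filter=filt,
    )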
+# Clone state annotation URL (same as deriva-py)
+_clone_state_url = "tag:isrd.isi.edu,2018:clone-state"
+
+
+def clone_catalog(
+    source_hostname: str,
+    source_catalog_id: str,
+    dest_hostname: str | None = None,
+    alias: str | None = None,
+    add_ml_schema: bool = False,
+    schema_only: bool = False,
+    asset_mode: AssetCopyMode = AssetCopyMode.REFERENCES,
+    asset_filter: AssetFilter | None = None,
+    copy_annotations: bool = True,
+    copy_policy: bool = True,
+    exclude_schemas: list[str] | None = None,
+    exclude_objects: list[str] | None = None,
+    source_credential: dict | None = None,
+    dest_credential: dict | None = None,
+    reinitialize_dataset_versions: bool = True,
+    orphan_strategy: OrphanStrategy = OrphanStrategy.FAIL,
+    prune_hidden_fkeys: bool = False,
+) -> CloneCatalogResult:
+    """Clone a catalog with robust handling of policy-induced FK violations.
+
+    This function handles the common case where source catalog policies are
+    incoherent: some domain tables have row-level policies hiding data, but
+    referring tables don't have matching policies, leading to visible references
+    to invisible rows.
+
+    Uses a three-stage approach:
+    1. Create schema WITHOUT foreign keys
+    2. Copy all accessible data
+    3. Apply foreign keys, handling violations based on orphan_strategy
+
+    Args:
+        source_hostname: Hostname of the source catalog server.
+        source_catalog_id: ID of the catalog to clone.
+        dest_hostname: Destination hostname. If None, clones to same server.
+        alias: Optional alias name for the new catalog.
+        add_ml_schema: If True, add the DerivaML schema to the clone.
+        schema_only: If True, copy only schema structure without data.
+        asset_mode: How to handle assets during cloning.
+        asset_filter: Optional filter to selectively copy assets.
+        copy_annotations: If True (default), copy all catalog annotations.
+        copy_policy: If True (default), copy ACL policies (requires ownership).
+        exclude_schemas: List of schema names to exclude from cloning.
+        exclude_objects: List of specific tables to exclude ("schema:table").
+        source_credential: Optional credential dict for source server.
+        dest_credential: Optional credential dict for destination server.
+        reinitialize_dataset_versions: If True, reset dataset versions for clone.
+        orphan_strategy: How to handle rows with dangling FK references:
+            - FAIL: Abort if FK violations occur (default)
+            - DELETE: Delete orphan rows
+            - NULLIFY: Set dangling FK values to NULL
+        prune_hidden_fkeys: If True, skip FKs where referenced columns have
+            "select": null rights (indicating potentially hidden data). This
+            prevents FK violations but degrades schema structure.
+
+    Returns:
+        CloneCatalogResult with details of the cloned catalog.
+
+    Raises:
+        ValueError: If invalid parameters or FK violations with FAIL strategy.
+
+    Example:
+        >>> # Clone with orphan deletion
+        >>> result = clone_catalog(
+        ...     "source.org", "21",
+        ...     dest_hostname="localhost",
+        ...     orphan_strategy=OrphanStrategy.DELETE,
+        ... )
+
+        >>> # Conservative clone that prunes problematic FKs
+        >>> result = clone_catalog(
+        ...     "source.org", "21",
+        ...     dest_hostname="localhost",
+        ...     prune_hidden_fkeys=True,
+        ... )
+    """
+    # Determine destination
+    is_same_server = dest_hostname is None or dest_hostname == source_hostname
+    effective_dest_hostname = source_hostname if dest_hostname is None else dest_hostname
+
+    # Get source snapshot for provenance
+    source_snapshot = _get_catalog_snapshot(
+        source_hostname, source_catalog_id, source_credential
+    )
+
+    # Connect to source
+    src_cred = source_credential or get_credential(source_hostname)
+    src_server = DerivaServer("https", source_hostname, credentials=src_cred)
+    src_catalog = src_server.connect_ermrest(source_catalog_id)
+
+    # Connect to destination and create new catalog
+    if is_same_server:
+        dst_cred = src_cred
+        dst_server = src_server
+    else:
+        dst_cred = dest_credential or get_credential(effective_dest_hostname)
+        dst_server = DerivaServer("https", effective_dest_hostname, credentials=dst_cred)
+
+    dst_catalog = dst_server.create_ermrest_catalog(
+        name=f"Clone of {source_catalog_id}",
+        description=f"Cloned from {source_hostname}:{source_catalog_id}",
+    )
+
+    report = CloneReport()
+
+    # Perform the three-stage clone
+    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned = _clone_three_stage(
+        src_catalog=src_catalog,
+        dst_catalog=dst_catalog,
+        copy_data=not schema_only,
+        copy_annotations=copy_annotations,
+        copy_policy=copy_policy,
+        exclude_schemas=exclude_schemas or [],
+        exclude_objects=exclude_objects or [],
+        orphan_strategy=orphan_strategy,
+        prune_hidden_fkeys=prune_hidden_fkeys,
+        report=report,
+    )
+
+    result = CloneCatalogResult(
+        catalog_id=str(dst_catalog.catalog_id),
+        hostname=effective_dest_hostname,
+        schema_only=schema_only,
+        asset_mode=asset_mode,
+        source_hostname=source_hostname,
+        source_catalog_id=source_catalog_id,
+        source_snapshot=source_snapshot,
+        orphan_rows_removed=orphan_rows_removed,
+        orphan_rows_nullified=orphan_rows_nullified,
+        fkeys_pruned=fkeys_pruned,
+        report=report,
+    )
+
+    # Post-clone operations
+    result = _post_clone_operations(
+        result=result,
+        alias=alias,
+        add_ml_schema=add_ml_schema,
+        credential=dst_cred,
+    )
+
+    if reinitialize_dataset_versions and not schema_only:
+        result = _reinitialize_dataset_versions(
+            result=result,
+            credential=dst_cred,
+        )
+
+    return result
+
+
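The returned CloneCatalogResult carries enough provenance to audit a clone after the fact; a short sketch, with a hypothetical source host:

    result = clone_catalog("source.example.org", "21",
                           orphan_strategy=OrphanStrategy.DELETE)
    print(result.source_snapshot)       # source snaptime captured for provenance
    print(result.orphan_rows_removed)   # rows dropped to satisfy FK constraints
    for issue in result.report.issues:  # per-issue detail via CloneIssue.__str__
        print(issue)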
+def _clone_three_stage(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    copy_data: bool,
+    copy_annotations: bool,
+    copy_policy: bool,
+    exclude_schemas: list[str],
+    exclude_objects: list[str],
+    orphan_strategy: OrphanStrategy,
+    prune_hidden_fkeys: bool,
+    report: CloneReport,
+) -> tuple[int, int, int]:
+    """Perform three-stage catalog cloning.
+
+    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned)
+    """
+    src_model = src_catalog.getCatalogModel()
+
+    # Parse exclude_objects
+    excluded_tables: set[tuple[str, str]] = set()
+    for obj in exclude_objects:
+        if ":" in obj:
+            schema, table = obj.split(":", 1)
+            excluded_tables.add((schema, table))
+
+    # Set top-level config
+    if copy_policy and src_model.acls:
+        try:
+            dst_catalog.put('/acl', json=src_model.acls)
+        except Exception as e:
+            logger.warning(f"Could not copy ACLs (may not be owner): {e}")
+
+    if copy_annotations:
+        dst_catalog.put('/annotation', json=src_model.annotations)
+
+    # Build model content
+    new_model = []
+    clone_states = {}
+    fkeys_deferred = []
+    fkeys_pruned = 0
+
+    def prune_parts(d, *extra_victims):
+        victims = set(extra_victims)
+        if not copy_annotations:
+            victims |= {'annotations'}
+        if not copy_policy:
+            victims |= {'acls', 'acl_bindings'}
+        for k in victims:
+            d.pop(k, None)
+        return d
+
+    def copy_sdef(s):
+        d = prune_parts(s.prejson(), 'tables')
+        return d
+
+    def copy_tdef_core(t):
+        d = prune_parts(t.prejson(), 'foreign_keys')
+        d['column_definitions'] = [prune_parts(c) for c in d['column_definitions']]
+        d['keys'] = [prune_parts(k) for k in d.get('keys', [])]
+        d.setdefault('annotations', {})[_clone_state_url] = 1 if copy_data else None
+        return d
+
+    def should_prune_fkey(fkdef, src_table):
+        """Check if FK should be pruned due to hidden data."""
+        if not prune_hidden_fkeys:
+            return False
+
+        # Check if referenced columns have "select": null
+        for ref_col in fkdef.get('referenced_columns', []):
+            ref_schema = ref_col.get('schema_name')
+            ref_table = ref_col.get('table_name')
+            ref_col_name = ref_col.get('column_name')
+
+            if ref_schema and ref_table and ref_col_name:
+                try:
+                    ref_table_obj = src_model.schemas[ref_schema].tables[ref_table]
+                    col_obj = ref_table_obj.column_definitions[ref_col_name]
+                    # Check column rights
+                    rights = getattr(col_obj, 'rights', None)
+                    if rights and rights.get('select') is None:
+                        return True
+                except (KeyError, AttributeError):
+                    pass
+        return False
+
+    def copy_tdef_fkeys(t, sname, tname):
+        """Extract FKs, optionally pruning those with hidden references."""
+        nonlocal fkeys_pruned
+        fkeys = []
+        for fkdef in t.prejson().get('foreign_keys', []):
+            # Skip FKs to system tables
+            skip = False
+            for ref_col in fkdef.get('referenced_columns', []):
+                if (ref_col.get('schema_name') == 'public'
+                        and ref_col.get('table_name') in {'ERMrest_Client', 'ERMrest_Group', 'ERMrest_RID_Lease'}):
+                    skip = True
+                    break
+
+            if skip:
+                continue
+
+            if should_prune_fkey(fkdef, t):
+                fkeys_pruned += 1
+                fk_name = fkdef.get('names', [[sname, 'unknown']])[0]
+                report.add_issue(CloneIssue(
+                    severity=CloneIssueSeverity.WARNING,
+                    category=CloneIssueCategory.FK_PRUNED,
+                    message="FK pruned due to hidden reference data",
+                    table=f"{sname}:{tname}",
+                    details=f"FK {fk_name} references columns with 'select': null",
+                    action="Source catalog may have incoherent policies",
+                ))
+                continue
+
+            fkeys.append(prune_parts(fkdef.copy()))
+        return fkeys
+
+    # Collect schemas and tables
+    for sname, schema in src_model.schemas.items():
+        if sname in exclude_schemas:
+            continue
+
+        new_model.append(copy_sdef(schema))
+
+        for tname, table in schema.tables.items():
+            if (sname, tname) in excluded_tables:
+                report.tables_skipped.append(f"{sname}:{tname}")
+                continue
+
+            if table.kind != 'table':
+                continue
+
+            if 'RID' not in table.column_definitions.elements:
+                logger.warning(f"Table {sname}.{tname} lacks system columns, skipping")
+                report.tables_skipped.append(f"{sname}:{tname}")
+                continue
+
+            new_model.append(copy_tdef_core(table))
+            clone_states[(sname, tname)] = 1 if copy_data else None
+
+            # Collect FKs for deferred application
+            table_fkeys = copy_tdef_fkeys(table, sname, tname)
+            for fk in table_fkeys:
+                fkeys_deferred.append((sname, tname, fk))
+
+    # Stage 1: Apply schema without FKs
+    logger.info("Stage 1: Creating schema without foreign keys...")
+    if new_model:
+        dst_catalog.post("/schema", json=new_model)
+
+    # Stage 2: Copy data
+    total_rows = 0
+    if copy_data:
+        logger.info("Stage 2: Copying data...")
+        page_size = 10000
+
+        for (sname, tname), state in clone_states.items():
+            if state != 1:
+                continue
+
+            tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+            logger.debug(f"Copying data for {sname}:{tname}")
+
+            last = None
+            table_rows = 0
+
+            while True:
+                after_clause = f"@after({urlquote(last)})" if last else ""
+                try:
+                    page = src_catalog.get(
+                        f"/entity/{tname_uri}@sort(RID){after_clause}?limit={page_size}"
+                    ).json()
+                except Exception as e:
+                    logger.warning(f"Failed to read from {sname}:{tname}: {e}")
+                    report.tables_failed.append(f"{sname}:{tname}")
+                    break
+
+                if page:
+                    try:
+                        dst_catalog.post(
+                            f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
+                            json=page
+                        )
+                        last = page[-1]['RID']
+                        table_rows += len(page)
+                    except Exception as e:
+                        logger.warning(f"Failed to write to {sname}:{tname}: {e}")
+                        report.tables_failed.append(f"{sname}:{tname}")
+                        break
+                else:
+                    break
+
+            if f"{sname}:{tname}" not in report.tables_failed:
+                report.tables_restored[f"{sname}:{tname}"] = table_rows
+                total_rows += table_rows
+
+                # Mark complete
+                try:
+                    dst_catalog.put(
+                        f"/schema/{urlquote(sname)}/table/{urlquote(tname)}/annotation/{urlquote(_clone_state_url)}",
+                        json=2
+                    )
+                except Exception:
+                    pass
+
+        logger.info(f"Stage 2 complete: {total_rows} rows copied")
+
+    # Stage 3: Apply foreign keys
+    logger.info("Stage 3: Applying foreign keys...")
+    orphan_rows_removed = 0
+    orphan_rows_nullified = 0
+
+    if orphan_strategy == OrphanStrategy.DELETE:
+        # For DELETE strategy, we use a three-phase approach:
+        #   Phase 1: Identify all FK violations without applying FKs yet
+        #   Phase 2: Delete orphan rows in dependency order (leaf tables first)
+        #   Phase 3: Apply all FKs
+        # This ensures deletions aren't blocked by already-applied FKs.
+
+        # Phase 1: Identify orphan values for each FK
+        logger.info("Phase 1: Identifying orphan values...")
+        fk_orphans: list[tuple[str, str, dict, set]] = []  # (sname, tname, fk, orphan_values)
+
+        for sname, tname, fk in fkeys_deferred:
+            orphan_values = _identify_orphan_values(dst_catalog, sname, tname, fk)
+            if orphan_values:
+                fk_orphans.append((sname, tname, fk, orphan_values))
+                logger.info(f"Found {len(orphan_values)} orphan values in {sname}:{tname}")
+
+        # Phase 2: Delete orphan rows in dependency order
+        # We need to delete from "leaf" tables first (tables that reference others
+        # but are not referenced themselves), then work our way up
+        if fk_orphans:
+            logger.info("Phase 2: Deleting orphan rows...")
+
+            # Build a map of which tables have orphans and which tables they reference
+            tables_with_orphans: set[tuple[str, str]] = set()
+            table_references: dict[tuple[str, str], set[tuple[str, str]]] = {}
+
+            for sname, tname, fk, orphan_values in fk_orphans:
+                table_key = (sname, tname)
+                tables_with_orphans.add(table_key)
+                if table_key not in table_references:
+                    table_references[table_key] = set()
+                for ref_col in fk.get('referenced_columns', []):
+                    ref_key = (ref_col.get('schema_name'), ref_col.get('table_name'))
+                    if ref_key[0] and ref_key[1]:
+                        table_references[table_key].add(ref_key)
+
+            # Also track which tables have FKs pointing TO them
+            referenced_by: dict[tuple[str, str], set[tuple[str, str]]] = {}
+            for sname, tname, fk in fkeys_deferred:
+                for ref_col in fk.get('referenced_columns', []):
+                    ref_key = (ref_col.get('schema_name'), ref_col.get('table_name'))
+                    if ref_key[0] and ref_key[1]:
+                        if ref_key not in referenced_by:
+                            referenced_by[ref_key] = set()
+                        referenced_by[ref_key].add((sname, tname))
+
+            # Process deletions in waves with cascading orphan detection
+            # After each wave of deletions, we may have created new orphans in
+            # tables that referenced the deleted rows
+            max_waves = 20
+            all_processed_fks: set[tuple[str, str, str]] = set()  # (schema, table, fk_name)
+
+            for wave in range(max_waves):
+                # Re-identify orphans for all FKs not yet fully processed
+                current_orphans: list[tuple[str, str, dict, set]] = []
+
+                for sname, tname, fk in fkeys_deferred:
+                    fk_names = fk.get('names', [])
+                    fk_id = (sname, tname, str(fk_names))
+                    if fk_id in all_processed_fks:
+                        continue
+
+                    orphan_values = _identify_orphan_values(dst_catalog, sname, tname, fk)
+                    if orphan_values:
+                        current_orphans.append((sname, tname, fk, orphan_values))
+
+                if not current_orphans:
+                    logger.info(f"Deletion wave {wave + 1}: no more orphans found")
+                    break
+
+                logger.info(f"Deletion wave {wave + 1}: processing {len(current_orphans)} FKs with orphans")
+
+                # Delete orphans for each FK
+                wave_deleted = 0
+                for sname, tname, fk, orphan_values in current_orphans:
+                    removed, nullified = _delete_orphan_rows(
+                        dst_catalog, sname, tname, fk, orphan_values, report
+                    )
+                    orphan_rows_removed += removed
+                    orphan_rows_nullified += nullified
+                    wave_deleted += removed
+
+                    # Mark this FK as processed if we deleted all orphans
+                    if removed == len(orphan_values):
+                        fk_names = fk.get('names', [])
+                        fk_id = (sname, tname, str(fk_names))
+                        all_processed_fks.add(fk_id)
+
+                if wave_deleted == 0:
+                    # No deletions in this wave - might be stuck
+                    logger.warning(f"Deletion wave {wave + 1}: no rows deleted, may have circular dependencies")
+                    break
+
+        # Phase 3: Apply all FKs
+        logger.info("Phase 3: Applying foreign keys...")
+        failed_fks = []
+
+        for sname, tname, fk in fkeys_deferred:
+            try:
+                dst_catalog.post("/schema", json=[fk])
+                report.fkeys_applied += 1
+            except Exception as e:
+                failed_fks.append((sname, tname, fk, str(e)))
+
+        # Retry failed FKs with additional orphan cleanup
+        for retry_round in range(10):
+            if not failed_fks:
+                break
+
+            logger.info(f"FK retry round {retry_round + 1}: {len(failed_fks)} FKs still failing")
+
+            # Try to clean up any remaining orphans for failed FKs
+            for sname, tname, fk, last_error in failed_fks:
+                orphan_values = _identify_orphan_values(dst_catalog, sname, tname, fk)
+                if orphan_values:
+                    removed, nullified = _delete_orphan_rows(
+                        dst_catalog, sname, tname, fk, orphan_values, report
+                    )
+                    orphan_rows_removed += removed
+                    orphan_rows_nullified += nullified
+
+            # Try to apply the failed FKs
+            still_failed = []
+            for sname, tname, fk, last_error in failed_fks:
+                try:
+                    dst_catalog.post("/schema", json=[fk])
+                    report.fkeys_applied += 1
+                except Exception as e:
+                    still_failed.append((sname, tname, fk, str(e)))
+
+            if len(still_failed) == len(failed_fks):
+                # No progress - stop retrying
+                logger.warning(f"FK retry round {retry_round + 1}: no progress, stopping retries")
+                break
+
+            failed_fks = still_failed
+
+        # Record final failures
+        for sname, tname, fk, error_msg in failed_fks:
+            report.fkeys_failed += 1
+            report.add_issue(CloneIssue(
+                severity=CloneIssueSeverity.ERROR,
+                category=CloneIssueCategory.FK_VIOLATION,
+                message="FK still failing after handling orphans",
+                table=f"{sname}:{tname}",
+                details=error_msg[:500],
+            ))
+
+    else:
+        # For NULLIFY or FAIL strategies, use the simpler single-pass approach
+        for sname, tname, fk in fkeys_deferred:
+            try:
+                dst_catalog.post("/schema", json=[fk])
+                report.fkeys_applied += 1
+            except Exception as e:
+                error_msg = str(e)
+
+                if orphan_strategy == OrphanStrategy.FAIL:
+                    report.fkeys_failed += 1
+                    report.add_issue(CloneIssue(
+                        severity=CloneIssueSeverity.ERROR,
+                        category=CloneIssueCategory.FK_VIOLATION,
+                        message="FK constraint failed",
+                        table=f"{sname}:{tname}",
+                        details=error_msg[:500],
+                        action="Use orphan_strategy=DELETE or NULLIFY to handle",
+                    ))
+                    continue
+
+                # NULLIFY strategy
+                removed, nullified = _handle_fk_violation(
+                    dst_catalog, sname, tname, fk, orphan_strategy, report
+                )
+                orphan_rows_removed += removed
+                orphan_rows_nullified += nullified
+
+                # Retry FK application
+                try:
+                    dst_catalog.post("/schema", json=[fk])
+                    report.fkeys_applied += 1
+                except Exception as retry_error:
+                    report.fkeys_failed += 1
+                    report.add_issue(CloneIssue(
+                        severity=CloneIssueSeverity.ERROR,
+                        category=CloneIssueCategory.FK_VIOLATION,
+                        message="FK still failing after nullifying orphans",
+                        table=f"{sname}:{tname}",
+                        details=str(retry_error)[:500],
+                    ))
+
+    report.fkeys_pruned = fkeys_pruned
+
+    # Stage 3b: Copy configuration
+    if copy_annotations or copy_policy:
+        _copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)
+
+    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned
+
+
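The wave loop above is the subtle part of the DELETE strategy: deleting an orphan can itself orphan rows that referenced it. A self-contained toy sketch of the same cascade over in-memory rows (no ERMrest involved; table names and values are made up):

    # Toy rows: {"RID": ..., "fk": parent RID}. Parent "P1" was never copied.
    tables = {
        "Parent": [],
        "Child":  [{"RID": "C1", "fk": "P1"}],
        "Grand":  [{"RID": "G1", "fk": "C1"}],
    }
    fks = [("Child", "fk", "Parent"), ("Grand", "fk", "Child")]

    def orphans(table, col, ref_table):
        refs = {r["RID"] for r in tables[ref_table]}
        return {r[col] for r in tables[table]} - refs

    # Delete in waves until no FK has dangling values (cf. max_waves above).
    for wave in range(20):
        dangling = {(t, c, rt): orphans(t, c, rt) for t, c, rt in fks}
        dangling = {k: v for k, v in dangling.items() if v}
        if not dangling:
            break
        for (t, c, _), bad in dangling.items():
            tables[t] = [r for r in tables[t] if r[c] not in bad]

    # Wave 1 removes C1; wave 2 removes the newly orphaned G1.
    assert all(rows == [] for rows in tables.values())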
+def _identify_orphan_values(
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    fk_def: dict,
+) -> set:
+    """Identify orphan FK values without deleting them.
+
+    Returns: Set of values that exist in the FK column but not in the referenced table.
+    """
+    fk_columns = fk_def.get('foreign_key_columns', [])
+    ref_columns = fk_def.get('referenced_columns', [])
+
+    if not fk_columns or not ref_columns:
+        return set()
+
+    # NOTE: only the first column of a (possibly composite) FK is considered.
+    src_col = fk_columns[0].get('column_name')
+    ref_schema = ref_columns[0].get('schema_name')
+    ref_table = ref_columns[0].get('table_name')
+    ref_col = ref_columns[0].get('column_name')
+
+    if not all([src_col, ref_schema, ref_table, ref_col]):
+        return set()
+
+    src_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+    ref_uri = f"{urlquote(ref_schema)}:{urlquote(ref_table)}"
+
+    try:
+        src_values = dst_catalog.get(
+            f"/attributegroup/{src_uri}/{urlquote(src_col)}"
+        ).json()
+        src_value_set = {row[src_col] for row in src_values if row.get(src_col) is not None}
+    except Exception as e:
+        logger.error(f"Failed to get source values for {sname}:{tname}.{src_col}: {e}")
+        return set()
+
+    try:
+        ref_values = dst_catalog.get(
+            f"/attributegroup/{ref_uri}/{urlquote(ref_col)}"
+        ).json()
+        ref_value_set = {row[ref_col] for row in ref_values if row.get(ref_col) is not None}
+    except Exception as e:
+        logger.error(f"Failed to get reference values for {ref_schema}:{ref_table}.{ref_col}: {e}")
+        return set()
+
+    return src_value_set - ref_value_set
+
+
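Stripped of the HTTP fetches, the detection is a set difference over distinct column values; with hypothetical values:

    child_fk_values = {"P1", "P2", "P3"}  # distinct values of Child.fk
    parent_keys     = {"P1", "P3"}        # distinct values of Parent's key
    assert child_fk_values - parent_keys == {"P2"}  # the orphan values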
+def _delete_orphan_rows(
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    fk_def: dict,
+    orphan_values: set,
+    report: CloneReport,
+) -> tuple[int, int]:
+    """Delete rows with orphan FK values.
+
+    Returns: (rows_removed, rows_nullified)
+    """
+    fk_columns = fk_def.get('foreign_key_columns', [])
+    ref_columns = fk_def.get('referenced_columns', [])
+
+    if not fk_columns or not ref_columns:
+        return 0, 0
+
+    src_col = fk_columns[0].get('column_name')
+    ref_schema = ref_columns[0].get('schema_name')
+    ref_table = ref_columns[0].get('table_name')
+
+    src_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+
+    rows_removed = 0
+    for value in orphan_values:
+        encoded_value = urlquote(str(value), safe='') if isinstance(value, str) else str(value)
+        try:
+            dst_catalog.delete(f"/entity/{src_uri}/{urlquote(src_col)}={encoded_value}")
+            rows_removed += 1
+        except Exception as e:
+            # Log but don't fail - the row might have been deleted by a previous operation
+            # or might be blocked by another FK that will be handled later
+            logger.debug(f"Could not delete {sname}:{tname} where {src_col}={value}: {e}")
+
+    # Record in report
+    if rows_removed > 0:
+        table_key = f"{sname}:{tname}"
+        if table_key not in report.orphan_details:
+            report.orphan_details[table_key] = {
+                "rows_removed": 0,
+                "rows_nullified": 0,
+                "missing_references": {},
+            }
+
+        report.orphan_details[table_key]["rows_removed"] += rows_removed
+        ref_key = f"{ref_schema}:{ref_table}"
+        report.orphan_details[table_key]["missing_references"][ref_key] = len(orphan_values)
+
+        report.add_issue(CloneIssue(
+            severity=CloneIssueSeverity.WARNING,
+            category=CloneIssueCategory.ORPHAN_ROWS,
+            message="Orphan rows deleted",
+            table=table_key,
+            details=f"Missing references to: {ref_key} ({len(orphan_values)})",
+            action="Source catalog may have incoherent row-level policies",
+            row_count=rows_removed,
+        ))
+
+    return rows_removed, 0
+
+
+def _handle_fk_violation(
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    fk_def: dict,
+    strategy: OrphanStrategy,
+    report: CloneReport,
+) -> tuple[int, int]:
+    """Handle FK violation by deleting or nullifying orphan rows.
+
+    Returns: (rows_removed, rows_nullified)
+    """
+    fk_columns = fk_def.get('foreign_key_columns', [])
+    ref_columns = fk_def.get('referenced_columns', [])
+
+    if not fk_columns or not ref_columns:
+        return 0, 0
+
+    src_col = fk_columns[0].get('column_name')
+    ref_schema = ref_columns[0].get('schema_name')
+    ref_table = ref_columns[0].get('table_name')
+    ref_col = ref_columns[0].get('column_name')
+
+    if not all([src_col, ref_schema, ref_table, ref_col]):
+        return 0, 0
+
+    src_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+    ref_uri = f"{urlquote(ref_schema)}:{urlquote(ref_table)}"
+
+    # Find orphan values
+    try:
+        src_values = dst_catalog.get(
+            f"/attributegroup/{src_uri}/{urlquote(src_col)}"
+        ).json()
+        src_value_set = {row[src_col] for row in src_values if row.get(src_col) is not None}
+    except Exception as e:
+        logger.error(f"Failed to get source values: {e}")
+        return 0, 0
+
+    try:
+        ref_values = dst_catalog.get(
+            f"/attributegroup/{ref_uri}/{urlquote(ref_col)}"
+        ).json()
+        ref_value_set = {row[ref_col] for row in ref_values if row.get(ref_col) is not None}
+    except Exception as e:
+        logger.error(f"Failed to get reference values: {e}")
+        return 0, 0
+
+    orphan_values = src_value_set - ref_value_set
+
+    if not orphan_values:
+        return 0, 0
+
+    logger.info(f"Found {len(orphan_values)} orphan values in {sname}:{tname}.{src_col}")
+
+    rows_removed = 0
+    rows_nullified = 0
+
+    for value in orphan_values:
+        encoded_value = urlquote(str(value), safe='') if isinstance(value, str) else str(value)
+
+        if strategy == OrphanStrategy.DELETE:
+            try:
+                dst_catalog.delete(f"/entity/{src_uri}/{urlquote(src_col)}={encoded_value}")
+                rows_removed += 1
+            except Exception as e:
+                logger.warning(f"Failed to delete orphans for {src_col}={value}: {e}")
+        elif strategy == OrphanStrategy.NULLIFY:
+            try:
+                # Set FK column to NULL for orphan rows
+                dst_catalog.put(
+                    f"/attributegroup/{src_uri}/{urlquote(src_col)}={encoded_value}/{urlquote(src_col)}",
+                    json=None
+                )
+                rows_nullified += 1
+            except Exception as e:
+                logger.warning(f"Failed to nullify {src_col}={value}: {e}")
+
+    # Record in report
+    table_key = f"{sname}:{tname}"
+    if table_key not in report.orphan_details:
+        report.orphan_details[table_key] = {
+            "rows_removed": 0,
+            "rows_nullified": 0,
+            "missing_references": {},
+        }
+
+    report.orphan_details[table_key]["rows_removed"] += rows_removed
+    report.orphan_details[table_key]["rows_nullified"] += rows_nullified
+    ref_key = f"{ref_schema}:{ref_table}"
+    report.orphan_details[table_key]["missing_references"][ref_key] = len(orphan_values)
+
+    action_taken = "deleted" if strategy == OrphanStrategy.DELETE else "nullified"
+    report.add_issue(CloneIssue(
+        severity=CloneIssueSeverity.WARNING,
+        category=CloneIssueCategory.ORPHAN_ROWS,
+        message=f"Orphan rows {action_taken}",
+        table=table_key,
+        details=f"Missing references to: {ref_key} ({len(orphan_values)})",
+        action="Source catalog may have incoherent row-level policies",
+        row_count=rows_removed + rows_nullified,
+    ))
+
+    return rows_removed, rows_nullified
+
+
+def _copy_configuration(
+    src_model,
+    dst_catalog: ErmrestCatalog,
+    copy_annotations: bool,
+    copy_policy: bool,
+    exclude_schemas: list[str],
+    excluded_tables: set[tuple[str, str]],
+) -> None:
+    """Copy annotations and policies after FK application."""
+    dst_model = dst_catalog.getCatalogModel()
+
+    for sname, src_schema in src_model.schemas.items():
+        if sname in exclude_schemas or sname not in dst_model.schemas:
+            continue
+
+        dst_schema = dst_model.schemas[sname]
+
+        if copy_annotations:
+            for k, v in src_schema.annotations.items():
+                if k != _clone_state_url:
+                    dst_schema.annotations[k] = v
+
+        if copy_policy:
+            if hasattr(dst_schema, 'acls') and hasattr(src_schema, 'acls'):
+                dst_schema.acls.update(src_schema.acls)
+            if hasattr(dst_schema, 'acl_bindings') and hasattr(src_schema, 'acl_bindings'):
+                dst_schema.acl_bindings.update(src_schema.acl_bindings)
+
+        for tname, src_table in src_schema.tables.items():
+            if (sname, tname) in excluded_tables or tname not in dst_schema.tables:
+                continue
+
+            dst_table = dst_schema.tables[tname]
+
+            if copy_annotations:
+                for k, v in src_table.annotations.items():
+                    if k != _clone_state_url:
+                        dst_table.annotations[k] = v
+
+            if copy_policy:
+                if hasattr(dst_table, 'acls') and hasattr(src_table, 'acls'):
+                    dst_table.acls.update(src_table.acls)
+                if hasattr(dst_table, 'acl_bindings') and hasattr(src_table, 'acl_bindings'):
+                    dst_table.acl_bindings.update(src_table.acl_bindings)
+
+    try:
+        dst_model.apply()
+    except Exception as e:
+        logger.warning(f"Failed to apply some configuration: {e}")
+
+
+def _get_catalog_snapshot(
+    hostname: str,
+    catalog_id: str,
+    credential: dict | None,
+) -> str | None:
+    """Get the current snapshot ID for a catalog."""
+    try:
+        cred = credential or get_credential(hostname)
+        server = DerivaServer("https", hostname, credentials=cred)
+        catalog = server.connect_ermrest(catalog_id)
+        response = catalog.get("/")
+        if response.status_code == 200:
+            data = response.json()
+            return data.get("snaptime")
+    except Exception as e:
+        logger.warning(f"Could not get catalog snapshot: {e}")
+    return None
+
+
+def _post_clone_operations(
+    result: CloneCatalogResult,
+    alias: str | None,
+    add_ml_schema: bool,
+    credential: dict | None,
+) -> CloneCatalogResult:
+    """Perform post-clone operations."""
+    cred = credential or get_credential(result.hostname)
+    server = DerivaServer("https", result.hostname, credentials=cred)
+
+    if alias:
+        try:
+            server.post(
+                f"/ermrest/catalog/{result.catalog_id}/alias/{urlquote(alias)}",
+                json={}
+            )
+            result.alias = alias
+        except Exception as e:
+            logger.warning(f"Failed to create alias '{alias}': {e}")
+
+    if add_ml_schema:
+        try:
+            from deriva_ml.schema import add_ml_schema as add_schema
+            catalog = server.connect_ermrest(result.catalog_id)
+            add_schema(catalog)
+            result.ml_schema_added = True
+        except Exception as e:
+            logger.warning(f"Failed to add ML schema: {e}")
+
+    return result
+
+
+def _reinitialize_dataset_versions(
+    result: CloneCatalogResult,
+    credential: dict | None,
+) -> CloneCatalogResult:
+    """Reinitialize dataset versions after cloning."""
+    try:
+        cred = credential or get_credential(result.hostname)
+        server = DerivaServer("https", result.hostname, credentials=cred)
+        catalog = server.connect_ermrest(result.catalog_id)
+
+        model = catalog.getCatalogModel()
+        if "deriva-ml" not in model.schemas:
+            return result
+
+        datasets = catalog.get("/entity/deriva-ml:Dataset").json()
+
+        for dataset in datasets:
+            rid = dataset["RID"]
+            try:
+                catalog.post(
+                    "/entity/deriva-ml:Dataset_Version",
+                    json=[{
+                        "Dataset": rid,
+                        "Version": "0.0.1",
+                        "Description": f"Cloned from {result.source_hostname}:{result.source_catalog_id}",
+                    }]
+                )
+                result.datasets_reinitialized += 1
+            except Exception as e:
+                logger.warning(f"Failed to reinitialize version for dataset {rid}: {e}")
+
+    except Exception as e:
+        logger.warning(f"Failed to reinitialize dataset versions: {e}")
+
+    return result
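Finally, pulling the post-clone options together: a hedged sketch of a cross-server clone that also registers an alias and layers in the DerivaML schema (hostnames and the alias are placeholders):

    from deriva_ml.catalog.clone import AssetCopyMode, OrphanStrategy, clone_catalog

    result = clone_catalog(
        "source.example.org", "21",
        dest_hostname="dest.example.org",
        alias="my-clone",
        add_ml_schema=True,
        asset_mode=AssetCopyMode.REFERENCES,  # asset URLs keep pointing at the source
        orphan_strategy=OrphanStrategy.NULLIFY,
    )
    if result.report and result.report.fkeys_failed:
        print(result.report.to_text())        # inspect what could not be repaired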