deriva-ml 1.17.11__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +26 -0
- deriva_ml/catalog/__init__.py +10 -0
- deriva_ml/catalog/clone.py +777 -37
- deriva_ml/core/base.py +28 -0
- {deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +1 -1
- {deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/RECORD +10 -10
- {deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +0 -0
- {deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/__init__.py
CHANGED
@@ -75,6 +75,26 @@ def __getattr__(name: str) -> type:
         from deriva_ml.schema.validation import validate_ml_schema

         return validate_ml_schema
+    elif name == "CatalogProvenance":
+        from deriva_ml.catalog.clone import CatalogProvenance
+
+        return CatalogProvenance
+    elif name == "CatalogCreationMethod":
+        from deriva_ml.catalog.clone import CatalogCreationMethod
+
+        return CatalogCreationMethod
+    elif name == "CloneDetails":
+        from deriva_ml.catalog.clone import CloneDetails
+
+        return CloneDetails
+    elif name == "get_catalog_provenance":
+        from deriva_ml.catalog.clone import get_catalog_provenance
+
+        return get_catalog_provenance
+    elif name == "set_catalog_provenance":
+        from deriva_ml.catalog.clone import set_catalog_provenance
+
+        return set_catalog_provenance
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


@@ -90,6 +110,12 @@ __all__ = [
     # Schema validation (lazy-loaded)
     "SchemaValidationReport",
     "validate_ml_schema",
+    # Catalog provenance (lazy-loaded)
+    "CatalogProvenance",
+    "CatalogCreationMethod",
+    "CloneDetails",
+    "get_catalog_provenance",
+    "set_catalog_provenance",
     # Exceptions
     "DerivaMLException",
     "DerivaMLInvalidTerm",
deriva_ml/catalog/__init__.py
CHANGED
@@ -3,8 +3,13 @@
 from deriva_ml.catalog.clone import (
     AssetCopyMode,
     AssetFilter,
+    CatalogCreationMethod,
+    CatalogProvenance,
     CloneCatalogResult,
+    CloneDetails,
     clone_catalog,
+    get_catalog_provenance,
+    set_catalog_provenance,
 )
 from deriva_ml.catalog.localize import (
     LocalizeResult,
@@ -14,8 +19,13 @@ from deriva_ml.catalog.localize import (
 __all__ = [
     "AssetCopyMode",
     "AssetFilter",
+    "CatalogCreationMethod",
+    "CatalogProvenance",
     "CloneCatalogResult",
+    "CloneDetails",
     "LocalizeResult",
     "clone_catalog",
+    "get_catalog_provenance",
     "localize_assets",
+    "set_catalog_provenance",
 ]
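Both additions expose the same objects through two import paths: the top-level `deriva_ml` package resolves them lazily via its `__getattr__` hook (so `import deriva_ml` alone still avoids loading the clone module), while `deriva_ml.catalog` imports them eagerly. A minimal sketch of both paths (field values are illustrative placeholders):

```python
# Lazy path: accessing the name triggers deriva_ml.__getattr__, which
# imports deriva_ml.catalog.clone on first use.
from deriva_ml import CatalogCreationMethod, CatalogProvenance

# Eager path: the subpackage re-exports the same objects directly.
from deriva_ml.catalog import CloneDetails

prov = CatalogProvenance(
    creation_method=CatalogCreationMethod.CREATE,
    created_at="2025-01-01T00:00:00+00:00",
    hostname="example.org",  # placeholder hostname
    catalog_id="1",          # placeholder catalog ID
)
print(prov.creation_method.value)  # -> "create"
print(prov.is_clone)               # -> False: not a clone, no clone_details
```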
deriva_ml/catalog/clone.py
CHANGED
@@ -18,13 +18,16 @@ all edge cases including circular dependencies and complex FK relationships.

 from __future__ import annotations

+import json
 import logging
 from dataclasses import dataclass, field
+from datetime import datetime, timezone
 from enum import Enum
 from typing import Any
 from urllib.parse import quote as urlquote

 from deriva.core import DerivaServer, ErmrestCatalog, get_credential
+from deriva.core.hatrac_store import HatracStore

 logger = logging.getLogger("deriva_ml")

@@ -49,6 +52,7 @@ class CloneIssueCategory(Enum):
     FK_VIOLATION = "fk_violation"
     FK_PRUNED = "fk_pruned"  # FK was intentionally not applied
     POLICY_INCOHERENCE = "policy_incoherence"
+    INDEX_REBUILT = "index_rebuilt"  # Index was dropped and rebuilt due to size limits


 class OrphanStrategy(Enum):
@@ -258,6 +262,26 @@ class AssetFilter:
     rids: list[str] | None = None


+@dataclass
+class TruncatedValue:
+    """Record of a value that was truncated during cloning."""
+
+    table: str
+    rid: str
+    column: str
+    original_bytes: int
+    truncated_bytes: int
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "table": self.table,
+            "rid": self.rid,
+            "column": self.column,
+            "original_bytes": self.original_bytes,
+            "truncated_bytes": self.truncated_bytes,
+        }
+
+
 @dataclass
 class CloneCatalogResult:
     """Result of a catalog clone operation."""
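`TruncatedValue` is a plain record type; a short sketch of how one truncation event would serialize (all field values are illustrative):

```python
from deriva_ml.catalog.clone import TruncatedValue

# Illustrative record for a single truncated cell.
tv = TruncatedValue(
    table="deriva-ml:Dataset",  # "schema:table" key, matching the clone code
    rid="1-ABC2",               # placeholder RID
    column="Description",
    original_bytes=3100,
    truncated_bytes=2600,
)
assert tv.to_dict()["original_bytes"] == 3100
```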
@@ -275,12 +299,652 @@ class CloneCatalogResult:
     orphan_rows_removed: int = 0
     orphan_rows_nullified: int = 0
     fkeys_pruned: int = 0
+    rows_skipped: int = 0
+    truncated_values: list[TruncatedValue] = field(default_factory=list)
     report: CloneReport | None = None


 # Clone state annotation URL (same as deriva-py)
 _clone_state_url = "tag:isrd.isi.edu,2018:clone-state"

+# Catalog provenance annotation URL
+_catalog_provenance_url = "tag:deriva-ml.org,2025:catalog-provenance"
+
+# Pattern to detect btree index size errors
+_BTREE_INDEX_ERROR_PATTERN = "index row size"
+_BTREE_INDEX_NAME_PATTERN = r'for index "([^"]+)"'
+
+
+class CatalogCreationMethod(Enum):
+    """How a catalog was created."""
+
+    CLONE = "clone"  # Cloned from another catalog
+    CREATE = "create"  # Created programmatically (e.g., create_catalog)
+    SCHEMA = "schema"  # Created from schema definition
+    UNKNOWN = "unknown"  # Unknown or pre-existing catalog
+
+
+@dataclass
+class CloneDetails:
+    """Details specific to cloned catalogs."""
+
+    source_hostname: str
+    source_catalog_id: str
+    source_snapshot: str | None = None
+    source_schema_url: str | None = None  # Hatrac URL to source schema JSON
+    orphan_strategy: str = "fail"
+    truncate_oversized: bool = False
+    prune_hidden_fkeys: bool = False
+    schema_only: bool = False
+    asset_mode: str = "refs"
+    exclude_schemas: list[str] = field(default_factory=list)
+    exclude_objects: list[str] = field(default_factory=list)
+    rows_copied: int = 0
+    rows_skipped: int = 0
+    truncated_count: int = 0
+    orphan_rows_removed: int = 0
+    orphan_rows_nullified: int = 0
+    fkeys_pruned: int = 0
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "source_hostname": self.source_hostname,
+            "source_catalog_id": self.source_catalog_id,
+            "source_snapshot": self.source_snapshot,
+            "source_schema_url": self.source_schema_url,
+            "orphan_strategy": self.orphan_strategy,
+            "truncate_oversized": self.truncate_oversized,
+            "prune_hidden_fkeys": self.prune_hidden_fkeys,
+            "schema_only": self.schema_only,
+            "asset_mode": self.asset_mode,
+            "exclude_schemas": self.exclude_schemas,
+            "exclude_objects": self.exclude_objects,
+            "rows_copied": self.rows_copied,
+            "rows_skipped": self.rows_skipped,
+            "truncated_count": self.truncated_count,
+            "orphan_rows_removed": self.orphan_rows_removed,
+            "orphan_rows_nullified": self.orphan_rows_nullified,
+            "fkeys_pruned": self.fkeys_pruned,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "CloneDetails":
+        return cls(
+            source_hostname=data.get("source_hostname", ""),
+            source_catalog_id=data.get("source_catalog_id", ""),
+            source_snapshot=data.get("source_snapshot"),
+            source_schema_url=data.get("source_schema_url"),
+            orphan_strategy=data.get("orphan_strategy", "fail"),
+            truncate_oversized=data.get("truncate_oversized", False),
+            prune_hidden_fkeys=data.get("prune_hidden_fkeys", False),
+            schema_only=data.get("schema_only", False),
+            asset_mode=data.get("asset_mode", "refs"),
+            exclude_schemas=data.get("exclude_schemas", []),
+            exclude_objects=data.get("exclude_objects", []),
+            rows_copied=data.get("rows_copied", 0),
+            rows_skipped=data.get("rows_skipped", 0),
+            truncated_count=data.get("truncated_count", 0),
+            orphan_rows_removed=data.get("orphan_rows_removed", 0),
+            orphan_rows_nullified=data.get("orphan_rows_nullified", 0),
+            fkeys_pruned=data.get("fkeys_pruned", 0),
+        )
+
+
+@dataclass
+class CatalogProvenance:
+    """Provenance information for a catalog.
+
+    This metadata is stored as a catalog-level annotation and tracks
+    how the catalog was created, by whom, and with what parameters.
+    Supports both cloned catalogs and catalogs created by other means.
+
+    Attributes:
+        creation_method: How the catalog was created (clone, create, schema, unknown).
+        created_at: ISO timestamp when the catalog was created.
+        created_by: User or system that created the catalog (Globus identity or description).
+        hostname: Hostname where the catalog resides.
+        catalog_id: Catalog ID.
+        name: Human-readable name for the catalog.
+        description: Description of the catalog's purpose.
+        workflow_url: URL to the workflow/script that created the catalog (e.g., GitHub URL).
+        workflow_version: Version of the workflow (e.g., git commit hash, package version).
+        clone_details: If cloned, detailed information about the clone operation.
+    """
+
+    creation_method: CatalogCreationMethod
+    created_at: str
+    hostname: str
+    catalog_id: str
+    created_by: str | None = None
+    name: str | None = None
+    description: str | None = None
+    workflow_url: str | None = None
+    workflow_version: str | None = None
+    clone_details: CloneDetails | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        result = {
+            "creation_method": self.creation_method.value,
+            "created_at": self.created_at,
+            "hostname": self.hostname,
+            "catalog_id": self.catalog_id,
+            "created_by": self.created_by,
+            "name": self.name,
+            "description": self.description,
+            "workflow_url": self.workflow_url,
+            "workflow_version": self.workflow_version,
+        }
+        if self.clone_details:
+            result["clone_details"] = self.clone_details.to_dict()
+        return result
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "CatalogProvenance":
+        clone_details = None
+        if data.get("clone_details"):
+            clone_details = CloneDetails.from_dict(data["clone_details"])
+
+        # Handle legacy format where creation_method might be missing
+        method_str = data.get("creation_method", "unknown")
+        try:
+            creation_method = CatalogCreationMethod(method_str)
+        except ValueError:
+            creation_method = CatalogCreationMethod.UNKNOWN
+
+        return cls(
+            creation_method=creation_method,
+            created_at=data.get("created_at", ""),
+            hostname=data.get("hostname", ""),
+            catalog_id=data.get("catalog_id", ""),
+            created_by=data.get("created_by"),
+            name=data.get("name"),
+            description=data.get("description"),
+            workflow_url=data.get("workflow_url"),
+            workflow_version=data.get("workflow_version"),
+            clone_details=clone_details,
+        )
+
+    @property
+    def is_clone(self) -> bool:
+        """Return True if this catalog was cloned from another catalog."""
+        return self.creation_method == CatalogCreationMethod.CLONE and self.clone_details is not None
+
+
+def _upload_source_schema(
+    hostname: str,
+    catalog_id: str,
+    schema_json: dict[str, Any],
+    credential: dict | None,
+) -> str | None:
+    """Upload source schema JSON to Hatrac.
+
+    Args:
+        hostname: Destination catalog hostname.
+        catalog_id: Destination catalog ID.
+        schema_json: The source schema as a dictionary.
+        credential: Credential for Hatrac access.
+
+    Returns:
+        Hatrac URL for the uploaded schema, or None if upload failed.
+    """
+    try:
+        cred = credential or get_credential(hostname)
+        hatrac = HatracStore("https", hostname, credentials=cred)
+
+        # Create namespace for catalog provenance metadata if it doesn't exist
+        namespace = f"/hatrac/catalog/{catalog_id}/provenance"
+        try:
+            hatrac.create_namespace(namespace, parents=True)
+        except Exception:
+            pass  # Namespace may already exist
+
+        # Upload schema JSON
+        schema_bytes = json.dumps(schema_json, indent=2).encode("utf-8")
+        object_path = f"{namespace}/source-schema.json"
+
+        url = hatrac.put_obj(
+            object_path,
+            schema_bytes,
+            content_type="application/json",
+        )
+
+        logger.info(f"Uploaded source schema to {url}")
+        return url
+
+    except Exception as e:
+        logger.warning(f"Failed to upload source schema to Hatrac: {e}")
+        return None
+
+
+def _set_catalog_provenance(
+    dst_catalog: ErmrestCatalog,
+    provenance: CatalogProvenance,
+) -> None:
+    """Set the catalog provenance annotation on a catalog.
+
+    Args:
+        dst_catalog: Catalog connection.
+        provenance: Catalog provenance information.
+    """
+    try:
+        dst_catalog.put(
+            f"/annotation/{urlquote(_catalog_provenance_url)}",
+            json=provenance.to_dict(),
+        )
+        logger.info("Set catalog provenance annotation")
+    except Exception as e:
+        logger.warning(f"Failed to set catalog provenance annotation: {e}")
+
+
+def set_catalog_provenance(
+    catalog: ErmrestCatalog,
+    name: str | None = None,
+    description: str | None = None,
+    workflow_url: str | None = None,
+    workflow_version: str | None = None,
+    creation_method: CatalogCreationMethod = CatalogCreationMethod.CREATE,
+) -> CatalogProvenance:
+    """Set catalog provenance information for a newly created catalog.
+
+    Use this function when creating a catalog programmatically to record
+    how and why it was created. This is similar to workflow metadata but
+    at the catalog level.
+
+    Args:
+        catalog: The catalog to annotate.
+        name: Human-readable name for the catalog.
+        description: Description of the catalog's purpose.
+        workflow_url: URL to the workflow/script that created the catalog
+            (e.g., GitHub URL, notebook URL).
+        workflow_version: Version of the workflow (e.g., git commit hash,
+            package version, or semantic version).
+        creation_method: How the catalog was created. Defaults to CREATE.
+
+    Returns:
+        The CatalogProvenance object that was set.
+
+    Example:
+        >>> from deriva_ml.catalog import set_catalog_provenance, CatalogCreationMethod
+        >>> provenance = set_catalog_provenance(
+        ...     catalog,
+        ...     name="CIFAR-10 Training Catalog",
+        ...     description="Catalog for CIFAR-10 image classification experiments",
+        ...     workflow_url="https://github.com/org/repo/blob/main/setup_catalog.py",
+        ...     workflow_version="v1.2.0",
+        ... )
+    """
+    # Try to get current user identity
+    created_by = None
+    try:
+        # Get user info from catalog session
+        session_info = catalog.get("/authn/session").json()
+        if session_info and "client" in session_info:
+            client = session_info["client"]
+            created_by = client.get("display_name") or client.get("id")
+    except Exception:
+        pass
+
+    # Get catalog info
+    try:
+        catalog_info = catalog.get("/").json()
+        hostname = catalog_info.get("meta", {}).get("host", "")
+        catalog_id = str(catalog.catalog_id)
+    except Exception:
+        hostname = ""
+        catalog_id = str(catalog.catalog_id)
+
+    provenance = CatalogProvenance(
+        creation_method=creation_method,
+        created_at=datetime.now(timezone.utc).isoformat(),
+        hostname=hostname,
+        catalog_id=catalog_id,
+        created_by=created_by,
+        name=name,
+        description=description,
+        workflow_url=workflow_url,
+        workflow_version=workflow_version,
+    )
+
+    _set_catalog_provenance(catalog, provenance)
+    return provenance
+
+
+def get_catalog_provenance(catalog: ErmrestCatalog) -> CatalogProvenance | None:
+    """Get the catalog provenance information.
+
+    Returns provenance information if the catalog has it set. This includes
+    information about how the catalog was created (clone, create, schema),
+    who created it, and any workflow information.
+
+    Args:
+        catalog: The catalog to check.
+
+    Returns:
+        CatalogProvenance if available, None otherwise.
+    """
+    try:
+        model = catalog.getCatalogModel()
+        provenance_data = model.annotations.get(_catalog_provenance_url)
+        if provenance_data:
+            return CatalogProvenance.from_dict(provenance_data)
+    except Exception as e:
+        logger.debug(f"Could not get catalog provenance: {e}")
+
+    return None
+
+
+def _parse_index_error(error_msg: str) -> tuple[str | None, str | None]:
+    """Parse a btree index size error to extract index name and column.
+
+    Args:
+        error_msg: The error message from ERMrest/PostgreSQL.
+
+    Returns:
+        Tuple of (index_name, column_name) if this is an index size error,
+        (None, None) otherwise.
+    """
+    import re
+
+    if _BTREE_INDEX_ERROR_PATTERN not in error_msg:
+        return None, None
+
+    # Extract index name from error message
+    match = re.search(_BTREE_INDEX_NAME_PATTERN, error_msg)
+    if not match:
+        return None, None
+
+    index_name = match.group(1)
+
+    # Try to extract column name from index name (common pattern: table__column_idx)
+    # e.g., "dataset__keywords_idx" -> "keywords"
+    if "__" in index_name and index_name.endswith("_idx"):
+        parts = index_name.rsplit("__", 1)
+        if len(parts) == 2:
+            column_name = parts[1].replace("_idx", "")
+            return index_name, column_name
+
+    return index_name, None
+
+
+def _copy_table_data_with_retry(
+    src_catalog: ErmrestCatalog,
+    dst_catalog: ErmrestCatalog,
+    sname: str,
+    tname: str,
+    page_size: int,
+    report: "CloneReport",
+    deferred_indexes: dict[str, list[dict]],
+    truncate_oversized: bool = False,
+) -> tuple[int, int, list[TruncatedValue]]:
+    """Copy data for a single table with retry logic for index errors.
+
+    If a btree index size error occurs, this function will:
+    1. Detect the problematic index and column
+    2. Switch to row-by-row insertion mode
+    3. Either truncate oversized values (if truncate_oversized=True) or skip rows
+    4. Record skipped/truncated rows in the report
+
+    Args:
+        src_catalog: Source catalog connection.
+        dst_catalog: Destination catalog connection.
+        sname: Schema name.
+        tname: Table name.
+        page_size: Number of rows per page.
+        report: Clone report for recording issues.
+        deferred_indexes: Dict to collect indexes that need rebuilding.
+            Key is "schema:table", value is list of index definitions.
+        truncate_oversized: If True, truncate oversized values instead of skipping rows.
+
+    Returns:
+        Tuple of (rows_copied, rows_skipped, truncated_values).
+        rows_copied is -1 if the copy failed entirely.
+    """
+    tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
+    table_key = f"{sname}:{tname}"
+
+    # Maximum safe size for btree index values (with margin below 2704 limit)
+    MAX_INDEX_VALUE_BYTES = 2600
+    TRUNCATE_SUFFIX = "...[TRUNCATED]"
+
+    last = None
+    table_rows = 0
+    rows_skipped = 0
+    truncated_values: list[TruncatedValue] = []
+    row_by_row_mode = False
+    problematic_index = None
+    problematic_column = None
+
+    def truncate_row_values(row: dict, column: str | None) -> tuple[dict, list[TruncatedValue]]:
+        """Truncate oversized text values in a row.
+
+        Returns the modified row and list of truncation records.
+        """
+        truncations = []
+        modified_row = row.copy()
+        rid = row.get('RID', 'unknown')
+
+        # If we know the problematic column, only check that one
+        columns_to_check = [column] if column else list(row.keys())
+
+        for col in columns_to_check:
+            if col not in modified_row:
+                continue
+            value = modified_row[col]
+            if isinstance(value, str):
+                value_bytes = len(value.encode('utf-8'))
+                if value_bytes > MAX_INDEX_VALUE_BYTES:
+                    # Truncate to safe size, accounting for suffix
+                    max_chars = MAX_INDEX_VALUE_BYTES - len(TRUNCATE_SUFFIX.encode('utf-8'))
+                    # Be conservative - truncate by character count as approximation
+                    # since UTF-8 chars can be multi-byte
+                    truncated = value[:max_chars] + TRUNCATE_SUFFIX
+                    # Verify the result fits
+                    while len(truncated.encode('utf-8')) > MAX_INDEX_VALUE_BYTES:
+                        max_chars -= 100
+                        truncated = value[:max_chars] + TRUNCATE_SUFFIX
+
+                    modified_row[col] = truncated
+                    truncations.append(TruncatedValue(
+                        table=table_key,
+                        rid=str(rid),
+                        column=col,
+                        original_bytes=value_bytes,
+                        truncated_bytes=len(truncated.encode('utf-8')),
+                    ))
+                    logger.debug(
+                        f"Truncated {table_key}.{col} for RID {rid}: "
+                        f"{value_bytes} -> {len(truncated.encode('utf-8'))} bytes"
+                    )
+
+        return modified_row, truncations
+
+    while True:
+        after_clause = f"@after({urlquote(last)})" if last else ""
+        try:
+            page = src_catalog.get(
+                f"/entity/{tname_uri}@sort(RID){after_clause}?limit={page_size}"
+            ).json()
+        except Exception as e:
+            logger.warning(f"Failed to read from {sname}:{tname}: {e}")
+            return -1, rows_skipped, truncated_values
+
+        if not page:
+            break
+
+        if row_by_row_mode:
+            # Insert rows one at a time, handling oversized values
+            for row in page:
+                row_to_insert = row
+
+                # If truncation is enabled, try to truncate first
+                if truncate_oversized and problematic_column:
+                    row_to_insert, truncations = truncate_row_values(row, problematic_column)
+                    truncated_values.extend(truncations)
+
+                try:
+                    dst_catalog.post(
+                        f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
+                        json=[row_to_insert]
+                    )
+                    table_rows += 1
+                except Exception as row_error:
+                    error_msg = str(row_error)
+                    if _BTREE_INDEX_ERROR_PATTERN in error_msg:
+                        # This row has a value too large for the index
+                        if truncate_oversized:
+                            # Try truncating all text columns
+                            row_to_insert, truncations = truncate_row_values(row, None)
+                            truncated_values.extend(truncations)
+                            try:
+                                dst_catalog.post(
+                                    f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
+                                    json=[row_to_insert]
+                                )
+                                table_rows += 1
+                                continue
+                            except Exception:
+                                pass  # Fall through to skip
+
+                        rows_skipped += 1
+                        rid = row.get('RID', 'unknown')
+                        logger.debug(f"Skipping row {rid} in {table_key} due to index size limit")
+                    else:
+                        # Different error - log and skip
+                        rows_skipped += 1
+                        logger.debug(f"Skipping row in {table_key}: {row_error}")
+            last = page[-1]['RID']
+        else:
+            # Normal batch mode
+            try:
+                dst_catalog.post(
+                    f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
+                    json=page
+                )
+                last = page[-1]['RID']
+                table_rows += len(page)
+            except Exception as e:
+                error_msg = str(e)
+
+                # Check if this is a btree index size error
+                index_name, column_name = _parse_index_error(error_msg)
+
+                if index_name:
+                    action_desc = "Values will be truncated" if truncate_oversized else "Rows with oversized values will be skipped"
+                    logger.info(
+                        f"Detected btree index size error for '{index_name}' on {table_key}. "
+                        f"Switching to row-by-row mode. {action_desc}."
+                    )
+                    problematic_index = index_name
+                    problematic_column = column_name
+                    row_by_row_mode = True
+
+                    # Record the issue
+                    report.add_issue(CloneIssue(
+                        severity=CloneIssueSeverity.WARNING,
+                        category=CloneIssueCategory.INDEX_REBUILT,
+                        message=f"Index '{index_name}' has oversized values, using row-by-row mode",
+                        table=table_key,
+                        details=f"Column '{column_name}' has values exceeding btree 2704 byte limit",
+                        action=action_desc,
+                    ))
+
+                    # Retry this page in row-by-row mode
+                    for row in page:
+                        row_to_insert = row
+
+                        # If truncation is enabled, try to truncate first
+                        if truncate_oversized and problematic_column:
+                            row_to_insert, truncations = truncate_row_values(row, problematic_column)
+                            truncated_values.extend(truncations)
+
+                        try:
+                            dst_catalog.post(
+                                f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
+                                json=[row_to_insert]
+                            )
+                            table_rows += 1
+                        except Exception as row_error:
+                            error_msg_row = str(row_error)
+                            if _BTREE_INDEX_ERROR_PATTERN in error_msg_row:
+                                # Try truncating all columns if not already done
+                                if truncate_oversized:
+                                    row_to_insert, truncations = truncate_row_values(row, None)
+                                    truncated_values.extend(truncations)
+                                    try:
+                                        dst_catalog.post(
+                                            f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
+                                            json=[row_to_insert]
+                                        )
+                                        table_rows += 1
+                                        continue
+                                    except Exception:
+                                        pass  # Fall through to skip
+
+                                rows_skipped += 1
+                                rid = row.get('RID', 'unknown')
+                                logger.debug(f"Skipping row {rid} due to index size limit")
+                            else:
+                                rows_skipped += 1
+                                logger.debug(f"Skipping row: {row_error}")
+                    last = page[-1]['RID']
+                else:
+                    logger.warning(f"Failed to write to {sname}:{tname}: {e}")
+                    return -1, rows_skipped, truncated_values
+
+    # Report skipped rows
+    if rows_skipped > 0:
+        report.add_issue(CloneIssue(
+            severity=CloneIssueSeverity.WARNING,
+            category=CloneIssueCategory.DATA_INTEGRITY,
+            message=f"Skipped {rows_skipped} rows due to index size limits",
+            table=table_key,
+            details=f"Index '{problematic_index}' on column '{problematic_column}'",
+            action="These rows have values too large for btree index (>2704 bytes)",
+            row_count=rows_skipped,
+        ))
+        logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits")
+
+    # Report truncated values
+    if truncated_values:
+        report.add_issue(CloneIssue(
+            severity=CloneIssueSeverity.INFO,
+            category=CloneIssueCategory.DATA_INTEGRITY,
+            message=f"Truncated {len(truncated_values)} values to fit index size limits",
+            table=table_key,
+            details=f"Values in column '{problematic_column}' were truncated to <{MAX_INDEX_VALUE_BYTES} bytes",
+            action="Original data was preserved with '[TRUNCATED]' suffix",
+            row_count=len(truncated_values),
+        ))
+        logger.info(f"Truncated {len(truncated_values)} values in {table_key}")
+
+    return table_rows, rows_skipped, truncated_values
+
+
+def _rebuild_deferred_indexes(
+    dst_catalog: ErmrestCatalog,
+    deferred_indexes: dict[str, list[dict]],
+    report: "CloneReport",
+) -> None:
+    """Note any indexes that had issues during data copy.
+
+    This function is called after data copy to report on any index-related
+    issues that were encountered. Since ERMrest doesn't provide direct index
+    management, we can only report these issues for manual follow-up.
+
+    Args:
+        dst_catalog: Destination catalog.
+        deferred_indexes: Dict of table -> list of index definitions with issues.
+        report: Clone report.
+    """
+    if not deferred_indexes:
+        return
+
+    logger.info(f"Reporting {sum(len(v) for v in deferred_indexes.values())} index issues...")
+

 def clone_catalog(
     source_hostname: str,
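Taken together, the hunk above introduces a small provenance API. A hedged usage sketch for recording and reading provenance on a catalog created outside of cloning (the hostname and catalog ID are placeholders; `DerivaServer` and `get_credential` are the deriva-py entry points this module already imports):

```python
from deriva.core import DerivaServer, get_credential

from deriva_ml.catalog.clone import (
    CatalogCreationMethod,
    get_catalog_provenance,
    set_catalog_provenance,
)

hostname = "demo.derivacloud.org"  # placeholder hostname
server = DerivaServer("https", hostname, credentials=get_credential(hostname))
catalog = server.connect_ermrest("1")  # placeholder catalog ID

# Record how this catalog came to be; stored as a catalog-level annotation.
set_catalog_provenance(
    catalog,
    name="Demo Catalog",
    description="Example catalog for provenance testing",
    workflow_url="https://github.com/org/repo/blob/main/setup_catalog.py",
    workflow_version="v1.2.0",
    creation_method=CatalogCreationMethod.CREATE,
)

# Read it back; returns None if the annotation was never set.
prov = get_catalog_provenance(catalog)
if prov:
    print(prov.creation_method.value, prov.created_at)
```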
@@ -300,6 +964,7 @@ def clone_catalog(
     reinitialize_dataset_versions: bool = True,
     orphan_strategy: OrphanStrategy = OrphanStrategy.FAIL,
     prune_hidden_fkeys: bool = False,
+    truncate_oversized: bool = False,
 ) -> CloneCatalogResult:
     """Clone a catalog with robust handling of policy-induced FK violations.

@@ -336,9 +1001,18 @@ def clone_catalog(
         prune_hidden_fkeys: If True, skip FKs where referenced columns have
             "select": null rights (indicating potentially hidden data). This
             prevents FK violations but degrades schema structure.
+        truncate_oversized: If True, automatically truncate text values that
+            exceed PostgreSQL's btree index size limit (2704 bytes). Truncated
+            values will have "...[TRUNCATED]" appended. If False (default),
+            rows with oversized values are skipped. All truncations are recorded
+            in the result's truncated_values list.

     Returns:
-        CloneCatalogResult with details of the cloned catalog
+        CloneCatalogResult with details of the cloned catalog, including:
+            - truncated_values: List of TruncatedValue records for any values
+              that were truncated due to index size limits.
+            - rows_skipped: Count of rows skipped due to index size limits
+              (when truncate_oversized=False).

     Raises:
         ValueError: If invalid parameters or FK violations with FAIL strategy.
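A sketch of the new parameter in use (hostnames and catalog IDs are placeholders; the remaining `clone_catalog` parameters keep their defaults):

```python
from deriva_ml.catalog.clone import clone_catalog

result = clone_catalog(
    source_hostname="source.example.org",  # placeholder
    source_catalog_id="5",                 # placeholder
    truncate_oversized=True,               # truncate rather than skip oversized rows
)

print(f"rows skipped: {result.rows_skipped}")
for tv in result.truncated_values:
    print(tv.table, tv.rid, tv.column, tv.original_bytes, "->", tv.truncated_bytes)
```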
@@ -372,6 +1046,9 @@ def clone_catalog(
     src_server = DerivaServer("https", source_hostname, credentials=src_cred)
     src_catalog = src_server.connect_ermrest(source_catalog_id)

+    # Capture source schema for provenance before any modifications
+    source_schema_json = src_catalog.get("/schema").json()
+
     # Connect to destination and create new catalog
     if is_same_server:
         dst_cred = src_cred
@@ -387,8 +1064,15 @@ def clone_catalog(

     report = CloneReport()

+    # Track truncated values
+    truncated_values: list[TruncatedValue] = []
+    rows_skipped = 0
+
+    # Record clone timestamp
+    clone_timestamp = datetime.now(timezone.utc).isoformat()
+
     # Perform the three-stage clone
-    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned = _clone_three_stage(
+    orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values = _clone_three_stage(
         src_catalog=src_catalog,
         dst_catalog=dst_catalog,
         copy_data=not schema_only,
@@ -398,6 +1082,7 @@ def clone_catalog(
         exclude_objects=exclude_objects or [],
         orphan_strategy=orphan_strategy,
         prune_hidden_fkeys=prune_hidden_fkeys,
+        truncate_oversized=truncate_oversized,
         report=report,
     )

@@ -412,9 +1097,66 @@ def clone_catalog(
         orphan_rows_removed=orphan_rows_removed,
         orphan_rows_nullified=orphan_rows_nullified,
         fkeys_pruned=fkeys_pruned,
+        rows_skipped=rows_skipped,
+        truncated_values=truncated_values,
         report=report,
     )

+    # Upload source schema to Hatrac and set catalog provenance
+    source_schema_url = _upload_source_schema(
+        hostname=effective_dest_hostname,
+        catalog_id=result.catalog_id,
+        schema_json=source_schema_json,
+        credential=dst_cred,
+    )
+
+    # Calculate total rows copied from report
+    total_rows_copied = sum(report.tables_restored.values())
+
+    # Try to get current user identity
+    created_by = None
+    try:
+        session_info = dst_catalog.get("/authn/session").json()
+        if session_info and "client" in session_info:
+            client = session_info["client"]
+            created_by = client.get("display_name") or client.get("id")
+    except Exception:
+        pass
+
+    # Create clone details
+    clone_details = CloneDetails(
+        source_hostname=source_hostname,
+        source_catalog_id=source_catalog_id,
+        source_snapshot=source_snapshot,
+        source_schema_url=source_schema_url,
+        orphan_strategy=orphan_strategy.value,
+        truncate_oversized=truncate_oversized,
+        prune_hidden_fkeys=prune_hidden_fkeys,
+        schema_only=schema_only,
+        asset_mode=asset_mode.value,
+        exclude_schemas=exclude_schemas or [],
+        exclude_objects=exclude_objects or [],
+        rows_copied=total_rows_copied,
+        rows_skipped=rows_skipped,
+        truncated_count=len(truncated_values),
+        orphan_rows_removed=orphan_rows_removed,
+        orphan_rows_nullified=orphan_rows_nullified,
+        fkeys_pruned=fkeys_pruned,
+    )
+
+    # Create and set catalog provenance annotation
+    provenance = CatalogProvenance(
+        creation_method=CatalogCreationMethod.CLONE,
+        created_at=clone_timestamp,
+        hostname=effective_dest_hostname,
+        catalog_id=result.catalog_id,
+        created_by=created_by,
+        name=alias or f"Clone of {source_catalog_id}",
+        description=f"Cloned from {source_hostname}:{source_catalog_id}",
+        clone_details=clone_details,
+    )
+    _set_catalog_provenance(dst_catalog, provenance)
+
     # Post-clone operations
     result = _post_clone_operations(
         result=result,
@@ -442,11 +1184,12 @@ def _clone_three_stage(
     exclude_objects: list[str],
     orphan_strategy: OrphanStrategy,
     prune_hidden_fkeys: bool,
+    truncate_oversized: bool,
     report: CloneReport,
-) -> tuple[int, int, int]:
+) -> tuple[int, int, int, int, list[TruncatedValue]]:
     """Perform three-stage catalog cloning.

-    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned)
+    Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values)
     """
     src_model = src_catalog.getCatalogModel()

@@ -584,6 +1327,10 @@ def _clone_three_stage(

     # Stage 2: Copy data
     total_rows = 0
+    total_rows_skipped = 0
+    all_truncated_values: list[TruncatedValue] = []
+    deferred_indexes: dict[str, list[dict]] = {}  # Track indexes dropped for later rebuild
+
     if copy_data:
         logger.info("Stage 2: Copying data...")
         page_size = 10000
@@ -592,40 +1339,29 @@ def _clone_three_stage(
             if state != 1:
                 continue

-
-            logger.debug(f"Copying data for {
-
-
-            table_rows =
-
-
-
-
-
-
-
-
-            report.tables_failed.append(f"{sname}:{tname}")
-            break
+            table_key = f"{sname}:{tname}"
+            logger.debug(f"Copying data for {table_key}")
+
+            # Use the new copy function with index error handling
+            table_rows, rows_skipped, truncated = _copy_table_data_with_retry(
+                src_catalog=src_catalog,
+                dst_catalog=dst_catalog,
+                sname=sname,
+                tname=tname,
+                page_size=page_size,
+                report=report,
+                deferred_indexes=deferred_indexes,
+                truncate_oversized=truncate_oversized,
+            )

-
-
-            dst_catalog.post(
-                f"/entity/{tname_uri}?nondefaults=RID,RCT,RCB",
-                json=page
-            )
-            last = page[-1]['RID']
-            table_rows += len(page)
-        except Exception as e:
-            logger.warning(f"Failed to write to {sname}:{tname}: {e}")
-            report.tables_failed.append(f"{sname}:{tname}")
-            break
-        else:
-            break
+            total_rows_skipped += rows_skipped
+            all_truncated_values.extend(truncated)

-            if
-
+            if table_rows < 0:
+                # Copy failed
+                report.tables_failed.append(table_key)
+            else:
+                report.tables_restored[table_key] = table_rows
             total_rows += table_rows

     # Mark complete
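The retry path above is driven by `_parse_index_error`. A sketch of what it extracts from a typical PostgreSQL btree error message (the wording below is illustrative of the server's phrasing, not captured from a live run):

```python
from deriva_ml.catalog.clone import _parse_index_error

msg = ('index row size 2816 exceeds btree version 4 maximum 2704 '
       'for index "dataset__keywords_idx"')

index_name, column_name = _parse_index_error(msg)
print(index_name)   # dataset__keywords_idx
print(column_name)  # keywords (derived from the table__column_idx naming pattern)
```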
@@ -639,6 +1375,10 @@ def _clone_three_stage(

     logger.info(f"Stage 2 complete: {total_rows} rows copied")

+    # Rebuild any indexes that were dropped during data copy
+    if deferred_indexes:
+        _rebuild_deferred_indexes(dst_catalog, deferred_indexes, report)
+
     # Stage 3: Apply foreign keys
     logger.info("Stage 3: Applying foreign keys...")
     orphan_rows_removed = 0
@@ -841,7 +1581,7 @@ def _clone_three_stage(
     if copy_annotations or copy_policy:
         _copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)

-    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned
+    return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_truncated_values


 def _identify_orphan_values(
deriva_ml/core/base.py
CHANGED
@@ -71,6 +71,7 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
 ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

 if TYPE_CHECKING:
+    from deriva_ml.catalog.clone import CatalogProvenance
     from deriva_ml.execution.execution import Execution
     from deriva_ml.model.catalog import DerivaModel

@@ -479,6 +480,33 @@ class DerivaML(
         except DerivaMLException as _e:
             raise DerivaMLException("Entity RID does not exist")

+    @property
+    def catalog_provenance(self) -> "CatalogProvenance | None":
+        """Get the provenance information for this catalog.
+
+        Returns provenance information if the catalog has it set. This includes
+        information about how the catalog was created (clone, create, schema),
+        who created it, when, and any workflow information.
+
+        For cloned catalogs, additional details about the clone operation are
+        available in the `clone_details` attribute.
+
+        Returns:
+            CatalogProvenance if available, None otherwise.
+
+        Example:
+            >>> ml = DerivaML('localhost', '45')
+            >>> prov = ml.catalog_provenance
+            >>> if prov:
+            ...     print(f"Created: {prov.created_at} by {prov.created_by}")
+            ...     print(f"Method: {prov.creation_method.value}")
+            ...     if prov.is_clone:
+            ...         print(f"Cloned from: {prov.clone_details.source_hostname}")
+        """
+        from deriva_ml.catalog.clone import get_catalog_provenance
+
+        return get_catalog_provenance(self.catalog)
+
     def user_list(self) -> List[Dict[str, str]]:
         """Returns catalog user list.

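The property is a thin wrapper over `get_catalog_provenance`. A sketch of checking clone lineage through it (hostname and catalog ID are placeholders, mirroring the docstring example):

```python
from deriva_ml import DerivaML

ml = DerivaML("localhost", "45")  # placeholder host and catalog ID
prov = ml.catalog_provenance
if prov is not None and prov.is_clone:
    details = prov.clone_details
    print(f"cloned from {details.source_hostname}:{details.source_catalog_id}")
    print(f"rows copied: {details.rows_copied}, truncated: {details.truncated_count}")
```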
{deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 deriva_ml/.DS_Store,sha256=gb-f5IXVed_gS5Be1Z6WxCYjrI_r5SdblvfFpIOY4ro,8196
-deriva_ml/__init__.py,sha256=
+deriva_ml/__init__.py,sha256=a7mk8HCe7i3SUGPRPm5ECvZwAYHswAEwgeGD6CBIAEk,4152
 deriva_ml/bump_version.py,sha256=DrVJA8AKqvwQ8Cc-omBLOjfDcKdBxUXm_XNj11SIJZo,11905
 deriva_ml/demo_catalog.py,sha256=_gQVeZm38dHmd2EThhfvCWRPt1meSuNXerkRULRO87U,17760
 deriva_ml/feature.py,sha256=Ap0cIK0kElAEfvlbfYtrWB23NJgy8St6Okhz-nDEZqY,8661
@@ -10,11 +10,11 @@ deriva_ml/run_notebook.py,sha256=BUShaMlzExfcjerm54en_zow2rcQFK6L0eHX-wwt_cg,277
 deriva_ml/asset/__init__.py,sha256=YuV0rFEL0kMDzB8W-qWiUs6HahEadiaYWuS-d3OcoMw,445
 deriva_ml/asset/asset.py,sha256=A8938V8iVufOzk5HdDxm5If1OkaLX1YJqQw-K-Um2rI,13489
 deriva_ml/asset/aux_classes.py,sha256=QIH_pd3koIG04fb-gzHVgdKtykfVgDGJH3F7RN3-dwg,3486
-deriva_ml/catalog/__init__.py,sha256=
-deriva_ml/catalog/clone.py,sha256=
+deriva_ml/catalog/__init__.py,sha256=WzAPL8EGtdVRliIsRe0RyTIivkDvlwpcn718liKlpsU,658
+deriva_ml/catalog/clone.py,sha256=PQpxtampevAo7xIF1MMORWMT-QEG6X_ubJ0VdpU0rSY,74577
 deriva_ml/catalog/localize.py,sha256=-YNvB_dYo0RjoI-VDj2Yu_qFB8TeAFPHfOJTYMTMYF8,14981
 deriva_ml/core/__init__.py,sha256=oqWgo4ckyAfebeXBQXJ9O8ans81tbmzPRnsVHLeVXT8,2000
-deriva_ml/core/base.py,sha256=
+deriva_ml/core/base.py,sha256=THdHOrTp7Rk0DxyzHW4PildQixn8Z-mqP1jCWMMgtxY,57135
 deriva_ml/core/config.py,sha256=2RjpJrzdXC1JlrDGozWbtW_0YAbOf7eyHHr-E0xTozw,9681
 deriva_ml/core/constants.py,sha256=dlS3Wa7Tmmh2JVhhCJjN5Wltu0bJB5rMOSChJ1bdhRA,5300
 deriva_ml/core/definitions.py,sha256=EPGTtUT0cBuss4sZRY-0mQHab9GqBZYdc4ozbxFqC4o,5578
@@ -69,9 +69,9 @@ deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbU
 deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
 deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
 deriva_ml/schema/validation.py,sha256=C0TvWj2kjOj40w1N5FIWp55DWPdLPN8tk3JJfN5ezW4,19912
-deriva_ml-1.17.
-deriva_ml-1.17.
-deriva_ml-1.17.
-deriva_ml-1.17.
-deriva_ml-1.17.
-deriva_ml-1.17.
+deriva_ml-1.17.12.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.17.12.dist-info/METADATA,sha256=pVWfVxpnSawna5gOvGBni0SgWD7oAUKzJsh1APsDba0,1216
+deriva_ml-1.17.12.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+deriva_ml-1.17.12.dist-info/entry_points.txt,sha256=nwRBpDI6yGUMhvEJG__O0LHz6JovazaVXhykvSNF4og,554
+deriva_ml-1.17.12.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.17.12.dist-info/RECORD,,
{deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL
File without changes
{deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt
File without changes
{deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE
File without changes
{deriva_ml-1.17.11.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt
File without changes