deriva-ml 1.17.13__py3-none-any.whl → 1.17.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/catalog/clone.py +88 -18
- {deriva_ml-1.17.13.dist-info → deriva_ml-1.17.15.dist-info}/METADATA +1 -1
- {deriva_ml-1.17.13.dist-info → deriva_ml-1.17.15.dist-info}/RECORD +7 -7
- {deriva_ml-1.17.13.dist-info → deriva_ml-1.17.15.dist-info}/WHEEL +0 -0
- {deriva_ml-1.17.13.dist-info → deriva_ml-1.17.15.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.17.13.dist-info → deriva_ml-1.17.15.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.13.dist-info → deriva_ml-1.17.15.dist-info}/top_level.txt +0 -0
deriva_ml/catalog/clone.py
CHANGED
|
@@ -87,9 +87,10 @@ class CloneIssue:
|
|
|
87
87
|
details: str | None = None
|
|
88
88
|
action: str | None = None
|
|
89
89
|
row_count: int = 0
|
|
90
|
+
skipped_rids: list[str] | None = None # RIDs of rows that were skipped
|
|
90
91
|
|
|
91
92
|
def to_dict(self) -> dict[str, Any]:
|
|
92
|
-
|
|
93
|
+
result = {
|
|
93
94
|
"severity": self.severity.value,
|
|
94
95
|
"category": self.category.value,
|
|
95
96
|
"message": self.message,
|
|
@@ -98,6 +99,9 @@ class CloneIssue:
|
|
|
98
99
|
"action": self.action,
|
|
99
100
|
"row_count": self.row_count,
|
|
100
101
|
}
|
|
102
|
+
if self.skipped_rids:
|
|
103
|
+
result["skipped_rids"] = self.skipped_rids
|
|
104
|
+
return result
|
|
101
105
|
|
|
102
106
|
def __str__(self) -> str:
|
|
103
107
|
parts = [f"[{self.severity.value.upper()}]"]
|
|
@@ -106,7 +110,14 @@ class CloneIssue:
|
|
|
106
110
|
parts.append(self.message)
|
|
107
111
|
if self.row_count > 0:
|
|
108
112
|
parts.append(f"({self.row_count} rows)")
|
|
109
|
-
|
|
113
|
+
result = " ".join(parts)
|
|
114
|
+
if self.skipped_rids:
|
|
115
|
+
# For small numbers, list the RIDs; for large numbers, just show count
|
|
116
|
+
if len(self.skipped_rids) <= 5:
|
|
117
|
+
result += f"\n Skipped RIDs: {', '.join(self.skipped_rids)}"
|
|
118
|
+
else:
|
|
119
|
+
result += f"\n Skipped RIDs: {len(self.skipped_rids)} rows (see JSON for full list)"
|
|
120
|
+
return result
|
|
110
121
|
|
|
111
122
|
|
|
112
123
|
@dataclass
|
|
@@ -332,6 +343,7 @@ class CloneDetails:
|
|
|
332
343
|
source_catalog_id: str
|
|
333
344
|
source_snapshot: str | None = None
|
|
334
345
|
source_schema_url: str | None = None # Hatrac URL to source schema JSON
|
|
346
|
+
# Clone parameters
|
|
335
347
|
orphan_strategy: str = "fail"
|
|
336
348
|
truncate_oversized: bool = False
|
|
337
349
|
prune_hidden_fkeys: bool = False
|
|
@@ -339,15 +351,21 @@ class CloneDetails:
|
|
|
339
351
|
asset_mode: str = "refs"
|
|
340
352
|
exclude_schemas: list[str] = field(default_factory=list)
|
|
341
353
|
exclude_objects: list[str] = field(default_factory=list)
|
|
354
|
+
add_ml_schema: bool = False
|
|
355
|
+
copy_annotations: bool = True
|
|
356
|
+
copy_policy: bool = True
|
|
357
|
+
reinitialize_dataset_versions: bool = True
|
|
358
|
+
# Statistics
|
|
342
359
|
rows_copied: int = 0
|
|
343
360
|
rows_skipped: int = 0
|
|
361
|
+
skipped_rids: list[str] = field(default_factory=list) # RIDs of skipped rows
|
|
344
362
|
truncated_count: int = 0
|
|
345
363
|
orphan_rows_removed: int = 0
|
|
346
364
|
orphan_rows_nullified: int = 0
|
|
347
365
|
fkeys_pruned: int = 0
|
|
348
366
|
|
|
349
367
|
def to_dict(self) -> dict[str, Any]:
|
|
350
|
-
|
|
368
|
+
result = {
|
|
351
369
|
"source_hostname": self.source_hostname,
|
|
352
370
|
"source_catalog_id": self.source_catalog_id,
|
|
353
371
|
"source_snapshot": self.source_snapshot,
|
|
@@ -359,6 +377,10 @@ class CloneDetails:
|
|
|
359
377
|
"asset_mode": self.asset_mode,
|
|
360
378
|
"exclude_schemas": self.exclude_schemas,
|
|
361
379
|
"exclude_objects": self.exclude_objects,
|
|
380
|
+
"add_ml_schema": self.add_ml_schema,
|
|
381
|
+
"copy_annotations": self.copy_annotations,
|
|
382
|
+
"copy_policy": self.copy_policy,
|
|
383
|
+
"reinitialize_dataset_versions": self.reinitialize_dataset_versions,
|
|
362
384
|
"rows_copied": self.rows_copied,
|
|
363
385
|
"rows_skipped": self.rows_skipped,
|
|
364
386
|
"truncated_count": self.truncated_count,
|
|
@@ -366,6 +388,9 @@ class CloneDetails:
|
|
|
366
388
|
"orphan_rows_nullified": self.orphan_rows_nullified,
|
|
367
389
|
"fkeys_pruned": self.fkeys_pruned,
|
|
368
390
|
}
|
|
391
|
+
if self.skipped_rids:
|
|
392
|
+
result["skipped_rids"] = self.skipped_rids
|
|
393
|
+
return result
|
|
369
394
|
|
|
370
395
|
@classmethod
|
|
371
396
|
def from_dict(cls, data: dict[str, Any]) -> "CloneDetails":
|
|
@@ -381,8 +406,13 @@ class CloneDetails:
|
|
|
381
406
|
asset_mode=data.get("asset_mode", "refs"),
|
|
382
407
|
exclude_schemas=data.get("exclude_schemas", []),
|
|
383
408
|
exclude_objects=data.get("exclude_objects", []),
|
|
409
|
+
add_ml_schema=data.get("add_ml_schema", False),
|
|
410
|
+
copy_annotations=data.get("copy_annotations", True),
|
|
411
|
+
copy_policy=data.get("copy_policy", True),
|
|
412
|
+
reinitialize_dataset_versions=data.get("reinitialize_dataset_versions", True),
|
|
384
413
|
rows_copied=data.get("rows_copied", 0),
|
|
385
414
|
rows_skipped=data.get("rows_skipped", 0),
|
|
415
|
+
skipped_rids=data.get("skipped_rids", []),
|
|
386
416
|
truncated_count=data.get("truncated_count", 0),
|
|
387
417
|
orphan_rows_removed=data.get("orphan_rows_removed", 0),
|
|
388
418
|
orphan_rows_nullified=data.get("orphan_rows_nullified", 0),
|
|
@@ -677,7 +707,7 @@ def _copy_table_data_with_retry(
|
|
|
677
707
|
report: "CloneReport",
|
|
678
708
|
deferred_indexes: dict[str, list[dict]],
|
|
679
709
|
truncate_oversized: bool = False,
|
|
680
|
-
) -> tuple[int, int, list[TruncatedValue]]:
|
|
710
|
+
) -> tuple[int, int, list[str], list[TruncatedValue]]:
|
|
681
711
|
"""Copy data for a single table with retry logic for index errors.
|
|
682
712
|
|
|
683
713
|
If a btree index size error occurs, this function will:
|
|
@@ -698,7 +728,7 @@ def _copy_table_data_with_retry(
|
|
|
698
728
|
truncate_oversized: If True, truncate oversized values instead of skipping rows.
|
|
699
729
|
|
|
700
730
|
Returns:
|
|
701
|
-
Tuple of (rows_copied, rows_skipped, truncated_values).
|
|
731
|
+
Tuple of (rows_copied, rows_skipped, skipped_rids, truncated_values).
|
|
702
732
|
rows_copied is -1 if the copy failed entirely.
|
|
703
733
|
"""
|
|
704
734
|
tname_uri = f"{urlquote(sname)}:{urlquote(tname)}"
|
|
@@ -711,6 +741,7 @@ def _copy_table_data_with_retry(
|
|
|
711
741
|
last = None
|
|
712
742
|
table_rows = 0
|
|
713
743
|
rows_skipped = 0
|
|
744
|
+
skipped_rids: list[str] = [] # Track RIDs of skipped rows
|
|
714
745
|
truncated_values: list[TruncatedValue] = []
|
|
715
746
|
row_by_row_mode = False
|
|
716
747
|
problematic_index = None
|
|
@@ -768,7 +799,7 @@ def _copy_table_data_with_retry(
|
|
|
768
799
|
).json()
|
|
769
800
|
except Exception as e:
|
|
770
801
|
logger.warning(f"Failed to read from {sname}:{tname}: {e}")
|
|
771
|
-
return -1, rows_skipped, truncated_values
|
|
802
|
+
return -1, rows_skipped, skipped_rids, truncated_values
|
|
772
803
|
|
|
773
804
|
if not page:
|
|
774
805
|
break
|
|
@@ -809,11 +840,14 @@ def _copy_table_data_with_retry(
|
|
|
809
840
|
|
|
810
841
|
rows_skipped += 1
|
|
811
842
|
rid = row.get('RID', 'unknown')
|
|
843
|
+
skipped_rids.append(rid)
|
|
812
844
|
logger.debug(f"Skipping row {rid} in {table_key} due to index size limit")
|
|
813
845
|
else:
|
|
814
846
|
# Different error - log and skip
|
|
815
847
|
rows_skipped += 1
|
|
816
|
-
|
|
848
|
+
rid = row.get('RID', 'unknown')
|
|
849
|
+
skipped_rids.append(rid)
|
|
850
|
+
logger.debug(f"Skipping row {rid} in {table_key}: {row_error}")
|
|
817
851
|
last = page[-1]['RID']
|
|
818
852
|
else:
|
|
819
853
|
# Normal batch mode
|
|
@@ -884,14 +918,17 @@ def _copy_table_data_with_retry(
|
|
|
884
918
|
|
|
885
919
|
rows_skipped += 1
|
|
886
920
|
rid = row.get('RID', 'unknown')
|
|
921
|
+
skipped_rids.append(rid)
|
|
887
922
|
logger.debug(f"Skipping row {rid} due to index size limit")
|
|
888
923
|
else:
|
|
889
924
|
rows_skipped += 1
|
|
890
|
-
|
|
925
|
+
rid = row.get('RID', 'unknown')
|
|
926
|
+
skipped_rids.append(rid)
|
|
927
|
+
logger.debug(f"Skipping row {rid}: {row_error}")
|
|
891
928
|
last = page[-1]['RID']
|
|
892
929
|
else:
|
|
893
930
|
logger.warning(f"Failed to write to {sname}:{tname}: {e}")
|
|
894
|
-
return -1, rows_skipped, truncated_values
|
|
931
|
+
return -1, rows_skipped, skipped_rids, truncated_values
|
|
895
932
|
|
|
896
933
|
# Report skipped rows
|
|
897
934
|
if rows_skipped > 0:
|
|
@@ -903,8 +940,9 @@ def _copy_table_data_with_retry(
|
|
|
903
940
|
details=f"Index '{problematic_index}' on column '{problematic_column}'",
|
|
904
941
|
action="These rows have values too large for btree index (>2704 bytes)",
|
|
905
942
|
row_count=rows_skipped,
|
|
943
|
+
skipped_rids=skipped_rids if skipped_rids else None,
|
|
906
944
|
))
|
|
907
|
-
logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits")
|
|
945
|
+
logger.warning(f"Skipped {rows_skipped} rows in {table_key} due to index size limits: RIDs={skipped_rids}")
|
|
908
946
|
|
|
909
947
|
# Report truncated values
|
|
910
948
|
if truncated_values:
|
|
@@ -919,7 +957,7 @@ def _copy_table_data_with_retry(
|
|
|
919
957
|
))
|
|
920
958
|
logger.info(f"Truncated {len(truncated_values)} values in {table_key}")
|
|
921
959
|
|
|
922
|
-
return table_rows, rows_skipped, truncated_values
|
|
960
|
+
return table_rows, rows_skipped, skipped_rids, truncated_values
|
|
923
961
|
|
|
924
962
|
|
|
925
963
|
|
|
@@ -1072,7 +1110,7 @@ def clone_catalog(
|
|
|
1072
1110
|
clone_timestamp = datetime.now(timezone.utc).isoformat()
|
|
1073
1111
|
|
|
1074
1112
|
# Perform the three-stage clone
|
|
1075
|
-
orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values = _clone_three_stage(
|
|
1113
|
+
orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values = _clone_three_stage(
|
|
1076
1114
|
src_catalog=src_catalog,
|
|
1077
1115
|
dst_catalog=dst_catalog,
|
|
1078
1116
|
copy_data=not schema_only,
|
|
@@ -1136,8 +1174,13 @@ def clone_catalog(
|
|
|
1136
1174
|
asset_mode=asset_mode.value,
|
|
1137
1175
|
exclude_schemas=exclude_schemas or [],
|
|
1138
1176
|
exclude_objects=exclude_objects or [],
|
|
1177
|
+
add_ml_schema=add_ml_schema,
|
|
1178
|
+
copy_annotations=copy_annotations,
|
|
1179
|
+
copy_policy=copy_policy,
|
|
1180
|
+
reinitialize_dataset_versions=reinitialize_dataset_versions,
|
|
1139
1181
|
rows_copied=total_rows_copied,
|
|
1140
1182
|
rows_skipped=rows_skipped,
|
|
1183
|
+
skipped_rids=skipped_rids,
|
|
1141
1184
|
truncated_count=len(truncated_values),
|
|
1142
1185
|
orphan_rows_removed=orphan_rows_removed,
|
|
1143
1186
|
orphan_rows_nullified=orphan_rows_nullified,
|
|
@@ -1186,10 +1229,10 @@ def _clone_three_stage(
|
|
|
1186
1229
|
prune_hidden_fkeys: bool,
|
|
1187
1230
|
truncate_oversized: bool,
|
|
1188
1231
|
report: CloneReport,
|
|
1189
|
-
) -> tuple[int, int, int, int, list[TruncatedValue]]:
|
|
1232
|
+
) -> tuple[int, int, int, int, list[str], list[TruncatedValue]]:
|
|
1190
1233
|
"""Perform three-stage catalog cloning.
|
|
1191
1234
|
|
|
1192
|
-
Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, truncated_values)
|
|
1235
|
+
Returns: (orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, rows_skipped, skipped_rids, truncated_values)
|
|
1193
1236
|
"""
|
|
1194
1237
|
src_model = src_catalog.getCatalogModel()
|
|
1195
1238
|
|
|
@@ -1328,6 +1371,7 @@ def _clone_three_stage(
|
|
|
1328
1371
|
# Stage 2: Copy data
|
|
1329
1372
|
total_rows = 0
|
|
1330
1373
|
total_rows_skipped = 0
|
|
1374
|
+
all_skipped_rids: list[str] = []
|
|
1331
1375
|
all_truncated_values: list[TruncatedValue] = []
|
|
1332
1376
|
deferred_indexes: dict[str, list[dict]] = {} # Track indexes dropped for later rebuild
|
|
1333
1377
|
|
|
@@ -1343,7 +1387,7 @@ def _clone_three_stage(
|
|
|
1343
1387
|
logger.debug(f"Copying data for {table_key}")
|
|
1344
1388
|
|
|
1345
1389
|
# Use the new copy function with index error handling
|
|
1346
|
-
table_rows, rows_skipped, truncated = _copy_table_data_with_retry(
|
|
1390
|
+
table_rows, rows_skipped, skipped_rids, truncated = _copy_table_data_with_retry(
|
|
1347
1391
|
src_catalog=src_catalog,
|
|
1348
1392
|
dst_catalog=dst_catalog,
|
|
1349
1393
|
sname=sname,
|
|
@@ -1355,6 +1399,7 @@ def _clone_three_stage(
|
|
|
1355
1399
|
)
|
|
1356
1400
|
|
|
1357
1401
|
total_rows_skipped += rows_skipped
|
|
1402
|
+
all_skipped_rids.extend(skipped_rids)
|
|
1358
1403
|
all_truncated_values.extend(truncated)
|
|
1359
1404
|
|
|
1360
1405
|
if table_rows < 0:
|
|
@@ -1581,7 +1626,7 @@ def _clone_three_stage(
|
|
|
1581
1626
|
if copy_annotations or copy_policy:
|
|
1582
1627
|
_copy_configuration(src_model, dst_catalog, copy_annotations, copy_policy, exclude_schemas, excluded_tables)
|
|
1583
1628
|
|
|
1584
|
-
return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_truncated_values
|
|
1629
|
+
return orphan_rows_removed, orphan_rows_nullified, fkeys_pruned, total_rows_skipped, all_skipped_rids, all_truncated_values
|
|
1585
1630
|
|
|
1586
1631
|
|
|
1587
1632
|
def _identify_orphan_values(
|
|
@@ -1892,12 +1937,37 @@ def _post_clone_operations(
|
|
|
1892
1937
|
|
|
1893
1938
|
if add_ml_schema:
|
|
1894
1939
|
try:
|
|
1895
|
-
from deriva_ml.schema import
|
|
1940
|
+
from deriva_ml.schema import create_ml_schema
|
|
1896
1941
|
catalog = server.connect_ermrest(result.catalog_id)
|
|
1897
|
-
|
|
1942
|
+
create_ml_schema(catalog)
|
|
1898
1943
|
result.ml_schema_added = True
|
|
1944
|
+
|
|
1945
|
+
# Apply catalog annotations (chaise-config, navbar, etc.)
|
|
1946
|
+
try:
|
|
1947
|
+
from deriva_ml import DerivaML
|
|
1948
|
+
ml = DerivaML(result.hostname, result.catalog_id, check_auth=False)
|
|
1949
|
+
ml.apply_catalog_annotations()
|
|
1950
|
+
logger.info("Applied catalog annotations (chaise-config, navbar)")
|
|
1951
|
+
except Exception as e:
|
|
1952
|
+
logger.warning(f"Failed to apply catalog annotations: {e}")
|
|
1953
|
+
if result.report:
|
|
1954
|
+
result.report.add_issue(CloneIssue(
|
|
1955
|
+
severity=CloneIssueSeverity.WARNING,
|
|
1956
|
+
category=CloneIssueCategory.SCHEMA_ISSUE,
|
|
1957
|
+
message="Failed to apply catalog annotations",
|
|
1958
|
+
details=str(e),
|
|
1959
|
+
action="Manually call apply_catalog_annotations() after clone",
|
|
1960
|
+
))
|
|
1899
1961
|
except Exception as e:
|
|
1900
1962
|
logger.warning(f"Failed to add ML schema: {e}")
|
|
1963
|
+
if result.report:
|
|
1964
|
+
result.report.add_issue(CloneIssue(
|
|
1965
|
+
severity=CloneIssueSeverity.ERROR,
|
|
1966
|
+
category=CloneIssueCategory.SCHEMA_ISSUE,
|
|
1967
|
+
message="Failed to add DerivaML schema",
|
|
1968
|
+
details=str(e),
|
|
1969
|
+
action="ML schema was not added to the clone",
|
|
1970
|
+
))
|
|
1901
1971
|
|
|
1902
1972
|
return result
|
|
1903
1973
|
|
|
@@ -11,7 +11,7 @@ deriva_ml/asset/__init__.py,sha256=YuV0rFEL0kMDzB8W-qWiUs6HahEadiaYWuS-d3OcoMw,4
|
|
|
11
11
|
deriva_ml/asset/asset.py,sha256=A8938V8iVufOzk5HdDxm5If1OkaLX1YJqQw-K-Um2rI,13489
|
|
12
12
|
deriva_ml/asset/aux_classes.py,sha256=QIH_pd3koIG04fb-gzHVgdKtykfVgDGJH3F7RN3-dwg,3486
|
|
13
13
|
deriva_ml/catalog/__init__.py,sha256=4rIVLhv8tlA3k5VEu8CrvCAkY22WTy36FgDtQBUjvNc,700
|
|
14
|
-
deriva_ml/catalog/clone.py,sha256=
|
|
14
|
+
deriva_ml/catalog/clone.py,sha256=4HOae4w94dkB4nmib_bCLwcOK1Pzqb4uFzcGfsbG6Sw,78285
|
|
15
15
|
deriva_ml/catalog/localize.py,sha256=-YNvB_dYo0RjoI-VDj2Yu_qFB8TeAFPHfOJTYMTMYF8,14981
|
|
16
16
|
deriva_ml/core/__init__.py,sha256=oqWgo4ckyAfebeXBQXJ9O8ans81tbmzPRnsVHLeVXT8,2000
|
|
17
17
|
deriva_ml/core/base.py,sha256=THdHOrTp7Rk0DxyzHW4PildQixn8Z-mqP1jCWMMgtxY,57135
|
|
@@ -69,9 +69,9 @@ deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbU
|
|
|
69
69
|
deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
|
|
70
70
|
deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
|
|
71
71
|
deriva_ml/schema/validation.py,sha256=C0TvWj2kjOj40w1N5FIWp55DWPdLPN8tk3JJfN5ezW4,19912
|
|
72
|
-
deriva_ml-1.17.
|
|
73
|
-
deriva_ml-1.17.
|
|
74
|
-
deriva_ml-1.17.
|
|
75
|
-
deriva_ml-1.17.
|
|
76
|
-
deriva_ml-1.17.
|
|
77
|
-
deriva_ml-1.17.
|
|
72
|
+
deriva_ml-1.17.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
73
|
+
deriva_ml-1.17.15.dist-info/METADATA,sha256=R9IPcPbmcE6SJsyo9wwoxxsksaIsXLXTc9XVBF5c-EU,1216
|
|
74
|
+
deriva_ml-1.17.15.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
75
|
+
deriva_ml-1.17.15.dist-info/entry_points.txt,sha256=nwRBpDI6yGUMhvEJG__O0LHz6JovazaVXhykvSNF4og,554
|
|
76
|
+
deriva_ml-1.17.15.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
|
|
77
|
+
deriva_ml-1.17.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|