dcicutils 8.8.0.1b5__py3-none-any.whl → 8.8.0.1b6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcicutils/structured_data.py +72 -42
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/METADATA +1 -1
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/RECORD +6 -6
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/entry_points.txt +0 -0
dcicutils/structured_data.py
CHANGED
@@ -12,7 +12,7 @@ from dcicutils.common import OrchestratedApp
|
|
12
12
|
from dcicutils.data_readers import CsvReader, Excel, RowReader
|
13
13
|
from dcicutils.datetime_utils import normalize_date_string, normalize_datetime_string
|
14
14
|
from dcicutils.file_utils import search_for_file
|
15
|
-
from dcicutils.misc_utils import (create_dict, create_readonly_object,
|
15
|
+
from dcicutils.misc_utils import (create_dict, create_readonly_object, load_json_if,
|
16
16
|
merge_objects, remove_empty_properties, right_trim,
|
17
17
|
split_string, to_boolean, to_enum, to_float, to_integer, VirtualApp)
|
18
18
|
from dcicutils.portal_object_utils import PortalObject
|
@@ -53,6 +53,10 @@ class StructuredDataSet:
|
|
53
53
|
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
54
54
|
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
55
55
|
# and value (string) arguments and return an integer of any of the below ORed together.
|
56
|
+
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
57
|
+
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
58
|
+
# currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
|
59
|
+
# And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
|
56
60
|
REF_LOOKUP_ROOT = 0x0001
|
57
61
|
REF_LOOKUP_ROOT_FIRST = 0x0002 | REF_LOOKUP_ROOT
|
58
62
|
REF_LOOKUP_SUBTYPES = 0x0004
|
@@ -228,8 +232,10 @@ class StructuredDataSet:
|
|
228
232
|
if ref_errors := self.ref_errors:
|
229
233
|
ref_errors_actual = []
|
230
234
|
for ref_error in ref_errors:
|
231
|
-
if not self.portal.ref_exists(ref_error["error"]):
|
235
|
+
if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
|
232
236
|
ref_errors_actual.append(ref_error)
|
237
|
+
else:
|
238
|
+
self._resolved_refs.add((ref, resolved[0].get("uuid")))
|
233
239
|
if ref_errors_actual:
|
234
240
|
self._errors["ref"] = ref_errors_actual
|
235
241
|
else:
|
@@ -565,7 +571,7 @@ class Schema(SchemaBase):
|
|
565
571
|
the names of any nested properties (i.e objects within objects) flattened into a single
|
566
572
|
property name in dot notation; and set the value of each of these flat property names
|
567
573
|
to the type of the terminal/leaf value of the (either) top-level or nested type. N.B. We
|
568
|
-
do NOT currently support array-of-
|
574
|
+
do NOT currently support array-of-array or array-of-multiple-types. E.g. for this schema:
|
569
575
|
|
570
576
|
{ "properties": {
|
571
577
|
"abc": {
|
@@ -783,66 +789,90 @@ class Portal(PortalBase):
|
|
783
789
|
return self._ref_cache.get(f"/{type_name}/{value}", None)
|
784
790
|
return None
|
785
791
|
|
786
|
-
def _cache_ref(self, type_name: str, value: str, resolved: List[str],
|
787
|
-
subtype_names: Optional[List[str]]) -> None:
|
792
|
+
def _cache_ref(self, type_name: str, value: str, resolved: List[str], subtype_names: Optional[List[str]]) -> None:
|
788
793
|
if self._ref_cache is not None:
|
789
|
-
for type_name in [type_name] + (subtype_names
|
790
|
-
|
791
|
-
if self._ref_cache.get(object_path, None) is None:
|
792
|
-
self._ref_cache[object_path] = resolved
|
794
|
+
for type_name in [type_name] + (subtype_names if subtype_names else []):
|
795
|
+
self._ref_cache[f"/{type_name}/{value}"] = resolved
|
793
796
|
|
794
797
|
def ref_exists(self, type_name: str, value: Optional[str] = None) -> List[dict]:
|
795
798
|
if not value:
|
796
799
|
if type_name.startswith("/") and len(parts := type_name[1:].split("/")) == 2:
|
797
|
-
type_name
|
798
|
-
|
800
|
+
if not (type_name := parts[0]) or not (value := parts[1]):
|
801
|
+
return []
|
799
802
|
else:
|
800
|
-
return []
|
803
|
+
return []
|
801
804
|
if (resolved := self._ref_exists_from_cache(type_name, value)) is not None:
|
805
|
+
# Found cached resolved reference.
|
806
|
+
if not resolved:
|
807
|
+
# Cached resolved reference is empty ([]).
|
808
|
+
# It might NOW be found internally, since the portal self._data can change.
|
809
|
+
# TODO
|
810
|
+
ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
|
811
|
+
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
812
|
+
subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
|
813
|
+
is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
|
814
|
+
if is_resolved:
|
815
|
+
resolved = [{"type": type_name, "uuid": resolved_uuid}]
|
816
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
817
|
+
return resolved
|
802
818
|
self._ref_exists_cache_hit_count += 1
|
803
819
|
return resolved
|
804
820
|
# Not cached here.
|
805
821
|
self._ref_exists_cache_miss_count += 1
|
806
|
-
|
822
|
+
# Get the lookup strategy.
|
807
823
|
ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
|
808
824
|
is_ref_lookup_root = StructuredDataSet._is_ref_lookup_root(ref_lookup_strategy)
|
809
825
|
is_ref_lookup_root_first = StructuredDataSet._is_ref_lookup_root_first(ref_lookup_strategy)
|
810
826
|
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
811
|
-
|
812
|
-
|
827
|
+
subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
|
828
|
+
# Lookup internally first (including at subtypes if desired).
|
829
|
+
is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
|
830
|
+
if is_resolved:
|
831
|
+
resolved = [{"type": type_name, "uuid": resolved_uuid}]
|
832
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
833
|
+
return resolved
|
834
|
+
# Not found internally; perform actual portal lookup (including at root and subtypes if desired).
|
835
|
+
# First construct the list of lookup paths at which to look for the referenced item.
|
836
|
+
lookup_paths = []
|
813
837
|
if is_ref_lookup_root_first:
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
838
|
+
lookup_paths.append(f"/{value}")
|
839
|
+
lookup_paths.append(f"/{type_name}/{value}")
|
840
|
+
if is_ref_lookup_root and not is_ref_lookup_root_first:
|
841
|
+
lookup_paths.append(f"/{value}")
|
842
|
+
if subtype_names:
|
843
|
+
for subtype_name in subtype_names:
|
844
|
+
lookup_paths.append(f"/{subtype_name}/{value}")
|
845
|
+
# Do the actual lookup in the portal for each of the desired lookup paths.
|
846
|
+
for lookup_path in lookup_paths:
|
847
|
+
if isinstance(item := self.get_metadata(lookup_path), dict):
|
848
|
+
resolved = [{"type": type_name, "uuid": item.get("uuid", None)}]
|
849
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
850
|
+
return resolved
|
851
|
+
return []
|
852
|
+
|
853
|
+
def _ref_exists_internally(self, type_name: str, value: str,
|
854
|
+
subtype_names: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
|
855
|
+
is_resolved, resolved_uuid = self._ref_exists_single_internally(type_name, value)
|
819
856
|
if is_resolved:
|
820
|
-
|
821
|
-
|
822
|
-
elif subtype_names and is_ref_lookup_subtypes:
|
857
|
+
return True, resolved_uuid
|
858
|
+
if subtype_names:
|
823
859
|
for subtype_name in subtype_names:
|
824
|
-
is_resolved, resolved_uuid = self.
|
860
|
+
is_resolved, resolved_uuid = self._ref_exists_single_internally(subtype_name, value)
|
825
861
|
if is_resolved:
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
# but TODO - do not think we should do this anymore - maybe test changes needed).
|
831
|
-
self._cache_ref(type_name, value, resolved, subtype_names)
|
832
|
-
return resolved
|
833
|
-
|
834
|
-
def _ref_exists_single(self, type_name: str, value: str, root: bool = False) -> Tuple[bool, Optional[str]]:
|
835
|
-
# Check first in our own data (i.e. e.g. within the given spreadsheet).
|
862
|
+
return True, resolved_uuid
|
863
|
+
return False, None
|
864
|
+
|
865
|
+
def _ref_exists_single_internally(self, type_name: str, value: str) -> Tuple[bool, Optional[str]]:
|
836
866
|
if self._data and (items := self._data.get(type_name)) and (schema := self.get_schema(type_name)):
|
837
|
-
|
867
|
+
identifying_properties = set(schema.get("identifyingProperties", [])) | {"identifier", "uuid"}
|
838
868
|
for item in items:
|
839
|
-
|
840
|
-
if
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
return
|
869
|
+
for identifying_property in identifying_properties:
|
870
|
+
if (identifying_value := item.get(identifying_property, None)) is not None:
|
871
|
+
if ((identifying_value == value) or
|
872
|
+
(isinstance(identifying_value, list) and (value in identifying_value))): # noqa
|
873
|
+
self._ref_exists_internal_count += 1
|
874
|
+
return True, item.get("uuid", None)
|
875
|
+
return False, None
|
846
876
|
|
847
877
|
@property
|
848
878
|
def ref_lookup_cache_hit_count(self) -> int:
|
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
|
|
62
62
|
dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
|
63
63
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
64
64
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
65
|
-
dcicutils/structured_data.py,sha256=
|
65
|
+
dcicutils/structured_data.py,sha256=7JDesiA0geGkP343yV3z9Bkc8qN22RKoT20cHrecEYA,47985
|
66
66
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
67
67
|
dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
|
68
68
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
69
69
|
dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
|
70
70
|
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
71
71
|
dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
|
72
|
-
dcicutils-8.8.0.
|
73
|
-
dcicutils-8.8.0.
|
74
|
-
dcicutils-8.8.0.
|
75
|
-
dcicutils-8.8.0.
|
76
|
-
dcicutils-8.8.0.
|
72
|
+
dcicutils-8.8.0.1b6.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
|
73
|
+
dcicutils-8.8.0.1b6.dist-info/METADATA,sha256=-MVcTLgcFRea1f0P8L91J8zmo1wbjbUbPr-V82goavo,3356
|
74
|
+
dcicutils-8.8.0.1b6.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
75
|
+
dcicutils-8.8.0.1b6.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
|
76
|
+
dcicutils-8.8.0.1b6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|