dcicutils 8.8.0.1b5__py3-none-any.whl → 8.8.0.1b6__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- dcicutils/structured_data.py +72 -42
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/METADATA +1 -1
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/RECORD +6 -6
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.0.1b5.dist-info → dcicutils-8.8.0.1b6.dist-info}/entry_points.txt +0 -0
dcicutils/structured_data.py
CHANGED
@@ -12,7 +12,7 @@ from dcicutils.common import OrchestratedApp
|
|
12
12
|
from dcicutils.data_readers import CsvReader, Excel, RowReader
|
13
13
|
from dcicutils.datetime_utils import normalize_date_string, normalize_datetime_string
|
14
14
|
from dcicutils.file_utils import search_for_file
|
15
|
-
from dcicutils.misc_utils import (create_dict, create_readonly_object,
|
15
|
+
from dcicutils.misc_utils import (create_dict, create_readonly_object, load_json_if,
|
16
16
|
merge_objects, remove_empty_properties, right_trim,
|
17
17
|
split_string, to_boolean, to_enum, to_float, to_integer, VirtualApp)
|
18
18
|
from dcicutils.portal_object_utils import PortalObject
|
@@ -53,6 +53,10 @@ class StructuredDataSet:
|
|
53
53
|
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
54
54
|
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
55
55
|
# and value (string) arguments and return an integer of any of the below ORed together.
|
56
|
+
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
57
|
+
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
58
|
+
# currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
|
59
|
+
# And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
|
56
60
|
REF_LOOKUP_ROOT = 0x0001
|
57
61
|
REF_LOOKUP_ROOT_FIRST = 0x0002 | REF_LOOKUP_ROOT
|
58
62
|
REF_LOOKUP_SUBTYPES = 0x0004
|
@@ -228,8 +232,10 @@ class StructuredDataSet:
|
|
228
232
|
if ref_errors := self.ref_errors:
|
229
233
|
ref_errors_actual = []
|
230
234
|
for ref_error in ref_errors:
|
231
|
-
if not self.portal.ref_exists(ref_error["error"]):
|
235
|
+
if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
|
232
236
|
ref_errors_actual.append(ref_error)
|
237
|
+
else:
|
238
|
+
self._resolved_refs.add((ref, resolved[0].get("uuid")))
|
233
239
|
if ref_errors_actual:
|
234
240
|
self._errors["ref"] = ref_errors_actual
|
235
241
|
else:
|
@@ -565,7 +571,7 @@ class Schema(SchemaBase):
|
|
565
571
|
the names of any nested properties (i.e. objects within objects) flattened into a single
|
566
572
|
property name in dot notation; and set the value of each of these flat property names
|
567
573
|
to the type of the terminal/leaf value of the (either) top-level or nested type. N.B. We
|
568
|
-
do NOT currently support array-of-
|
574
|
+
do NOT currently support array-of-array or array-of-multiple-types. E.g. for this schema:
|
569
575
|
|
570
576
|
{ "properties": {
|
571
577
|
"abc": {
|
@@ -783,66 +789,90 @@ class Portal(PortalBase):
|
|
783
789
|
return self._ref_cache.get(f"/{type_name}/{value}", None)
|
784
790
|
return None
|
785
791
|
|
786
|
-
def _cache_ref(self, type_name: str, value: str, resolved: List[str],
|
787
|
-
subtype_names: Optional[List[str]]) -> None:
|
792
|
+
def _cache_ref(self, type_name: str, value: str, resolved: List[str], subtype_names: Optional[List[str]]) -> None:
|
788
793
|
if self._ref_cache is not None:
|
789
|
-
for type_name in [type_name] + (subtype_names
|
790
|
-
|
791
|
-
if self._ref_cache.get(object_path, None) is None:
|
792
|
-
self._ref_cache[object_path] = resolved
|
794
|
+
for type_name in [type_name] + (subtype_names if subtype_names else []):
|
795
|
+
self._ref_cache[f"/{type_name}/{value}"] = resolved
|
793
796
|
|
794
797
|
def ref_exists(self, type_name: str, value: Optional[str] = None) -> List[dict]:
|
795
798
|
if not value:
|
796
799
|
if type_name.startswith("/") and len(parts := type_name[1:].split("/")) == 2:
|
797
|
-
type_name
|
798
|
-
|
800
|
+
if not (type_name := parts[0]) or not (value := parts[1]):
|
801
|
+
return []
|
799
802
|
else:
|
800
|
-
return []
|
803
|
+
return []
|
801
804
|
if (resolved := self._ref_exists_from_cache(type_name, value)) is not None:
|
805
|
+
# Found cached resolved reference.
|
806
|
+
if not resolved:
|
807
|
+
# Cached resolved reference is empty ([]).
|
808
|
+
# It might NOW be found internally, since the portal self._data can change.
|
809
|
+
# TODO
|
810
|
+
ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
|
811
|
+
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
812
|
+
subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
|
813
|
+
is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
|
814
|
+
if is_resolved:
|
815
|
+
resolved = [{"type": type_name, "uuid": resolved_uuid}]
|
816
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
817
|
+
return resolved
|
802
818
|
self._ref_exists_cache_hit_count += 1
|
803
819
|
return resolved
|
804
820
|
# Not cached here.
|
805
821
|
self._ref_exists_cache_miss_count += 1
|
806
|
-
|
822
|
+
# Get the lookup strategy.
|
807
823
|
ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
|
808
824
|
is_ref_lookup_root = StructuredDataSet._is_ref_lookup_root(ref_lookup_strategy)
|
809
825
|
is_ref_lookup_root_first = StructuredDataSet._is_ref_lookup_root_first(ref_lookup_strategy)
|
810
826
|
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
811
|
-
|
812
|
-
|
827
|
+
subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
|
828
|
+
# Lookup internally first (including at subtypes if desired).
|
829
|
+
is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
|
830
|
+
if is_resolved:
|
831
|
+
resolved = [{"type": type_name, "uuid": resolved_uuid}]
|
832
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
833
|
+
return resolved
|
834
|
+
# Not found internally; perform actual portal lookup (including at root and subtypes if desired).
|
835
|
+
# First construct the list of lookup paths at which to look for the referenced item.
|
836
|
+
lookup_paths = []
|
813
837
|
if is_ref_lookup_root_first:
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
838
|
+
lookup_paths.append(f"/{value}")
|
839
|
+
lookup_paths.append(f"/{type_name}/{value}")
|
840
|
+
if is_ref_lookup_root and not is_ref_lookup_root_first:
|
841
|
+
lookup_paths.append(f"/{value}")
|
842
|
+
if subtype_names:
|
843
|
+
for subtype_name in subtype_names:
|
844
|
+
lookup_paths.append(f"/{subtype_name}/{value}")
|
845
|
+
# Do the actual lookup in the portal for each of the desired lookup paths.
|
846
|
+
for lookup_path in lookup_paths:
|
847
|
+
if isinstance(item := self.get_metadata(lookup_path), dict):
|
848
|
+
resolved = [{"type": type_name, "uuid": item.get("uuid", None)}]
|
849
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
850
|
+
return resolved
|
851
|
+
return []
|
852
|
+
|
853
|
+
def _ref_exists_internally(self, type_name: str, value: str,
|
854
|
+
subtype_names: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
|
855
|
+
is_resolved, resolved_uuid = self._ref_exists_single_internally(type_name, value)
|
819
856
|
if is_resolved:
|
820
|
-
|
821
|
-
|
822
|
-
elif subtype_names and is_ref_lookup_subtypes:
|
857
|
+
return True, resolved_uuid
|
858
|
+
if subtype_names:
|
823
859
|
for subtype_name in subtype_names:
|
824
|
-
is_resolved, resolved_uuid = self.
|
860
|
+
is_resolved, resolved_uuid = self._ref_exists_single_internally(subtype_name, value)
|
825
861
|
if is_resolved:
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
# but TODO - do not think we should do this anymore - maybe test changes needed).
|
831
|
-
self._cache_ref(type_name, value, resolved, subtype_names)
|
832
|
-
return resolved
|
833
|
-
|
834
|
-
def _ref_exists_single(self, type_name: str, value: str, root: bool = False) -> Tuple[bool, Optional[str]]:
|
835
|
-
# Check first in our own data (i.e. e.g. within the given spreadsheet).
|
862
|
+
return True, resolved_uuid
|
863
|
+
return False, None
|
864
|
+
|
865
|
+
def _ref_exists_single_internally(self, type_name: str, value: str) -> Tuple[bool, Optional[str]]:
|
836
866
|
if self._data and (items := self._data.get(type_name)) and (schema := self.get_schema(type_name)):
|
837
|
-
|
867
|
+
identifying_properties = set(schema.get("identifyingProperties", [])) | {"identifier", "uuid"}
|
838
868
|
for item in items:
|
839
|
-
|
840
|
-
if
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
return
|
869
|
+
for identifying_property in identifying_properties:
|
870
|
+
if (identifying_value := item.get(identifying_property, None)) is not None:
|
871
|
+
if ((identifying_value == value) or
|
872
|
+
(isinstance(identifying_value, list) and (value in identifying_value))): # noqa
|
873
|
+
self._ref_exists_internal_count += 1
|
874
|
+
return True, item.get("uuid", None)
|
875
|
+
return False, None
|
846
876
|
|
847
877
|
@property
|
848
878
|
def ref_lookup_cache_hit_count(self) -> int:
|
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
|
|
62
62
|
dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
|
63
63
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
64
64
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
65
|
-
dcicutils/structured_data.py,sha256=
|
65
|
+
dcicutils/structured_data.py,sha256=7JDesiA0geGkP343yV3z9Bkc8qN22RKoT20cHrecEYA,47985
|
66
66
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
67
67
|
dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
|
68
68
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
69
69
|
dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
|
70
70
|
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
71
71
|
dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
|
72
|
-
dcicutils-8.8.0.
|
73
|
-
dcicutils-8.8.0.
|
74
|
-
dcicutils-8.8.0.
|
75
|
-
dcicutils-8.8.0.
|
76
|
-
dcicutils-8.8.0.
|
72
|
+
dcicutils-8.8.0.1b6.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
|
73
|
+
dcicutils-8.8.0.1b6.dist-info/METADATA,sha256=-MVcTLgcFRea1f0P8L91J8zmo1wbjbUbPr-V82goavo,3356
|
74
|
+
dcicutils-8.8.0.1b6.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
75
|
+
dcicutils-8.8.0.1b6.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
|
76
|
+
dcicutils-8.8.0.1b6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|