dcicutils 8.8.0.1b4__py3-none-any.whl → 8.8.0.1b6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcicutils/structured_data.py +84 -42
- {dcicutils-8.8.0.1b4.dist-info → dcicutils-8.8.0.1b6.dist-info}/METADATA +1 -1
- {dcicutils-8.8.0.1b4.dist-info → dcicutils-8.8.0.1b6.dist-info}/RECORD +6 -6
- {dcicutils-8.8.0.1b4.dist-info → dcicutils-8.8.0.1b6.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.0.1b4.dist-info → dcicutils-8.8.0.1b6.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.0.1b4.dist-info → dcicutils-8.8.0.1b6.dist-info}/entry_points.txt +0 -0
dcicutils/structured_data.py
CHANGED
@@ -12,7 +12,7 @@ from dcicutils.common import OrchestratedApp
|
|
12
12
|
from dcicutils.data_readers import CsvReader, Excel, RowReader
|
13
13
|
from dcicutils.datetime_utils import normalize_date_string, normalize_datetime_string
|
14
14
|
from dcicutils.file_utils import search_for_file
|
15
|
-
from dcicutils.misc_utils import (create_dict, create_readonly_object,
|
15
|
+
from dcicutils.misc_utils import (create_dict, create_readonly_object, load_json_if,
|
16
16
|
merge_objects, remove_empty_properties, right_trim,
|
17
17
|
split_string, to_boolean, to_enum, to_float, to_integer, VirtualApp)
|
18
18
|
from dcicutils.portal_object_utils import PortalObject
|
@@ -53,6 +53,10 @@ class StructuredDataSet:
|
|
53
53
|
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
54
54
|
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
55
55
|
# and value (string) arguments and return an integer of any of the below ORed together.
|
56
|
+
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
57
|
+
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
58
|
+
# currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
|
59
|
+
# And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
|
56
60
|
REF_LOOKUP_ROOT = 0x0001
|
57
61
|
REF_LOOKUP_ROOT_FIRST = 0x0002 | REF_LOOKUP_ROOT
|
58
62
|
REF_LOOKUP_SUBTYPES = 0x0004
|
@@ -228,8 +232,10 @@ class StructuredDataSet:
|
|
228
232
|
if ref_errors := self.ref_errors:
|
229
233
|
ref_errors_actual = []
|
230
234
|
for ref_error in ref_errors:
|
231
|
-
if not self.portal.ref_exists(ref_error["error"]):
|
235
|
+
if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
|
232
236
|
ref_errors_actual.append(ref_error)
|
237
|
+
else:
|
238
|
+
self._resolved_refs.add((ref, resolved[0].get("uuid")))
|
233
239
|
if ref_errors_actual:
|
234
240
|
self._errors["ref"] = ref_errors_actual
|
235
241
|
else:
|
@@ -291,6 +297,10 @@ class StructuredDataSet:
|
|
291
297
|
def ref_lookup_cache_miss_count(self) -> int:
|
292
298
|
return self.portal.ref_lookup_cache_miss_count if self.portal else -1
|
293
299
|
|
300
|
+
@property
|
301
|
+
def ref_lookup_count(self) -> int:
|
302
|
+
return self.portal.ref_lookup_count if self.portal else -1
|
303
|
+
|
294
304
|
@property
|
295
305
|
def ref_lookup_found_count(self) -> int:
|
296
306
|
return self.portal.ref_lookup_found_count if self.portal else -1
|
@@ -561,7 +571,7 @@ class Schema(SchemaBase):
|
|
561
571
|
the names of any nested properties (i.e. objects within objects) flattened into a single
|
562
572
|
property name in dot notation; and set the value of each of these flat property names
|
563
573
|
to the type of the terminal/leaf value of the (either) top-level or nested type. N.B. We
|
564
|
-
do NOT currently support array-of-
|
574
|
+
do NOT currently support array-of-array or array-of-multiple-types. E.g. for this schema:
|
565
575
|
|
566
576
|
{ "properties": {
|
567
577
|
"abc": {
|
@@ -779,69 +789,95 @@ class Portal(PortalBase):
|
|
779
789
|
return self._ref_cache.get(f"/{type_name}/{value}", None)
|
780
790
|
return None
|
781
791
|
|
782
|
-
def _cache_ref(self, type_name: str, value: str, resolved: List[str],
|
783
|
-
subtype_names: Optional[List[str]]) -> None:
|
792
|
+
def _cache_ref(self, type_name: str, value: str, resolved: List[str], subtype_names: Optional[List[str]]) -> None:
|
784
793
|
if self._ref_cache is not None:
|
785
|
-
for type_name in [type_name] + (subtype_names
|
786
|
-
|
787
|
-
if self._ref_cache.get(object_path, None) is None:
|
788
|
-
self._ref_cache[object_path] = resolved
|
794
|
+
for type_name in [type_name] + (subtype_names if subtype_names else []):
|
795
|
+
self._ref_cache[f"/{type_name}/{value}"] = resolved
|
789
796
|
|
790
797
|
def ref_exists(self, type_name: str, value: Optional[str] = None) -> List[dict]:
|
791
798
|
if not value:
|
792
799
|
if type_name.startswith("/") and len(parts := type_name[1:].split("/")) == 2:
|
793
|
-
type_name
|
794
|
-
|
800
|
+
if not (type_name := parts[0]) or not (value := parts[1]):
|
801
|
+
return []
|
795
802
|
else:
|
796
|
-
return []
|
803
|
+
return []
|
797
804
|
if (resolved := self._ref_exists_from_cache(type_name, value)) is not None:
|
805
|
+
# Found cached resolved reference.
|
806
|
+
if not resolved:
|
807
|
+
# Cached resolved reference is empty ([]).
|
808
|
+
# It might NOW be found internally, since the portal self._data can change.
|
809
|
+
# TODO
|
810
|
+
ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
|
811
|
+
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
812
|
+
subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
|
813
|
+
is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
|
814
|
+
if is_resolved:
|
815
|
+
resolved = [{"type": type_name, "uuid": resolved_uuid}]
|
816
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
817
|
+
return resolved
|
798
818
|
self._ref_exists_cache_hit_count += 1
|
799
819
|
return resolved
|
800
820
|
# Not cached here.
|
801
821
|
self._ref_exists_cache_miss_count += 1
|
802
|
-
|
822
|
+
# Get the lookup strategy.
|
803
823
|
ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
|
804
824
|
is_ref_lookup_root = StructuredDataSet._is_ref_lookup_root(ref_lookup_strategy)
|
805
825
|
is_ref_lookup_root_first = StructuredDataSet._is_ref_lookup_root_first(ref_lookup_strategy)
|
806
826
|
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
807
|
-
|
808
|
-
|
827
|
+
subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
|
828
|
+
# Lookup internally first (including at subtypes if desired).
|
829
|
+
is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
|
830
|
+
if is_resolved:
|
831
|
+
resolved = [{"type": type_name, "uuid": resolved_uuid}]
|
832
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
833
|
+
return resolved
|
834
|
+
# Not found internally; perform actual portal lookup (including at root and subtypes if desired).
|
835
|
+
# First construct the list of lookup paths at which to look for the referenced item.
|
836
|
+
lookup_paths = []
|
809
837
|
if is_ref_lookup_root_first:
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
838
|
+
lookup_paths.append(f"/{value}")
|
839
|
+
lookup_paths.append(f"/{type_name}/{value}")
|
840
|
+
if is_ref_lookup_root and not is_ref_lookup_root_first:
|
841
|
+
lookup_paths.append(f"/{value}")
|
842
|
+
if subtype_names:
|
843
|
+
for subtype_name in subtype_names:
|
844
|
+
lookup_paths.append(f"/{subtype_name}/{value}")
|
845
|
+
# Do the actual lookup in the portal for each of the desired lookup paths.
|
846
|
+
for lookup_path in lookup_paths:
|
847
|
+
if isinstance(item := self.get_metadata(lookup_path), dict):
|
848
|
+
resolved = [{"type": type_name, "uuid": item.get("uuid", None)}]
|
849
|
+
self._cache_ref(type_name, value, resolved, subtype_names)
|
850
|
+
return resolved
|
851
|
+
return []
|
852
|
+
|
853
|
+
def _ref_exists_internally(self, type_name: str, value: str,
|
854
|
+
subtype_names: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
|
855
|
+
is_resolved, resolved_uuid = self._ref_exists_single_internally(type_name, value)
|
815
856
|
if is_resolved:
|
816
|
-
|
817
|
-
|
818
|
-
elif subtype_names and is_ref_lookup_subtypes:
|
857
|
+
return True, resolved_uuid
|
858
|
+
if subtype_names:
|
819
859
|
for subtype_name in subtype_names:
|
820
|
-
is_resolved, resolved_uuid = self.
|
860
|
+
is_resolved, resolved_uuid = self._ref_exists_single_internally(subtype_name, value)
|
821
861
|
if is_resolved:
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
# but TODO - do not think we should do this anymore - maybe test changes needed).
|
827
|
-
self._cache_ref(type_name, value, resolved, subtype_names)
|
828
|
-
return resolved
|
829
|
-
|
830
|
-
def _ref_exists_single(self, type_name: str, value: str, root: bool = False) -> Tuple[bool, Optional[str]]:
|
831
|
-
# Check first in our own data (i.e. e.g. within the given spreadsheet).
|
862
|
+
return True, resolved_uuid
|
863
|
+
return False, None
|
864
|
+
|
865
|
+
def _ref_exists_single_internally(self, type_name: str, value: str) -> Tuple[bool, Optional[str]]:
|
832
866
|
if self._data and (items := self._data.get(type_name)) and (schema := self.get_schema(type_name)):
|
833
|
-
|
867
|
+
identifying_properties = set(schema.get("identifyingProperties", [])) | {"identifier", "uuid"}
|
834
868
|
for item in items:
|
835
|
-
|
836
|
-
if
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
return
|
869
|
+
for identifying_property in identifying_properties:
|
870
|
+
if (identifying_value := item.get(identifying_property, None)) is not None:
|
871
|
+
if ((identifying_value == value) or
|
872
|
+
(isinstance(identifying_value, list) and (value in identifying_value))): # noqa
|
873
|
+
self._ref_exists_internal_count += 1
|
874
|
+
return True, item.get("uuid", None)
|
875
|
+
return False, None
|
842
876
|
|
843
877
|
@property
|
844
878
|
def ref_lookup_cache_hit_count(self) -> int:
|
879
|
+
if self._ref_cache is None:
|
880
|
+
return 0
|
845
881
|
try:
|
846
882
|
return self.get_metadata_cache.cache_info().hits
|
847
883
|
except Exception:
|
@@ -849,11 +885,17 @@ class Portal(PortalBase):
|
|
849
885
|
|
850
886
|
@property
|
851
887
|
def ref_lookup_cache_miss_count(self) -> int:
|
888
|
+
if self._ref_cache is None:
|
889
|
+
return self.ref_lookup_count
|
852
890
|
try:
|
853
891
|
return self.get_metadata_cache.cache_info().misses
|
854
892
|
except Exception:
|
855
893
|
return -1
|
856
894
|
|
895
|
+
@property
|
896
|
+
def ref_lookup_count(self) -> int:
|
897
|
+
return self._ref_lookup_found_count + self._ref_lookup_notfound_count + self._ref_lookup_error_count
|
898
|
+
|
857
899
|
@property
|
858
900
|
def ref_lookup_found_count(self) -> int:
|
859
901
|
return self._ref_lookup_found_count
|
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
|
|
62
62
|
dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
|
63
63
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
64
64
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
65
|
-
dcicutils/structured_data.py,sha256=
|
65
|
+
dcicutils/structured_data.py,sha256=7JDesiA0geGkP343yV3z9Bkc8qN22RKoT20cHrecEYA,47985
|
66
66
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
67
67
|
dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
|
68
68
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
69
69
|
dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
|
70
70
|
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
71
71
|
dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
|
72
|
-
dcicutils-8.8.0.
|
73
|
-
dcicutils-8.8.0.
|
74
|
-
dcicutils-8.8.0.
|
75
|
-
dcicutils-8.8.0.
|
76
|
-
dcicutils-8.8.0.
|
72
|
+
dcicutils-8.8.0.1b6.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
|
73
|
+
dcicutils-8.8.0.1b6.dist-info/METADATA,sha256=-MVcTLgcFRea1f0P8L91J8zmo1wbjbUbPr-V82goavo,3356
|
74
|
+
dcicutils-8.8.0.1b6.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
75
|
+
dcicutils-8.8.0.1b6.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
|
76
|
+
dcicutils-8.8.0.1b6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|