dcicutils 8.8.0.1b4__py3-none-any.whl → 8.8.0.1b6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ from dcicutils.common import OrchestratedApp
12
12
  from dcicutils.data_readers import CsvReader, Excel, RowReader
13
13
  from dcicutils.datetime_utils import normalize_date_string, normalize_datetime_string
14
14
  from dcicutils.file_utils import search_for_file
15
- from dcicutils.misc_utils import (create_dict, create_readonly_object, is_uuid, load_json_if,
15
+ from dcicutils.misc_utils import (create_dict, create_readonly_object, load_json_if,
16
16
  merge_objects, remove_empty_properties, right_trim,
17
17
  split_string, to_boolean, to_enum, to_float, to_integer, VirtualApp)
18
18
  from dcicutils.portal_object_utils import PortalObject
@@ -53,6 +53,10 @@ class StructuredDataSet:
53
53
  # can choose to lookup root path first, or not lookup root path at all, or not lookup
54
54
  # subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
55
55
  # and value (string) arguments and return an integer of any of the below ORed together.
56
+ # The main purpose of this is optimization; to minimize portal lookups; since for example,
57
+ # currently at least, /{type}/{accession} does not work but /{accession} does; so we
58
+ # currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
59
+ # And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
56
60
  REF_LOOKUP_ROOT = 0x0001
57
61
  REF_LOOKUP_ROOT_FIRST = 0x0002 | REF_LOOKUP_ROOT
58
62
  REF_LOOKUP_SUBTYPES = 0x0004
@@ -228,8 +232,10 @@ class StructuredDataSet:
228
232
  if ref_errors := self.ref_errors:
229
233
  ref_errors_actual = []
230
234
  for ref_error in ref_errors:
231
- if not self.portal.ref_exists(ref_error["error"]):
235
+ if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
232
236
  ref_errors_actual.append(ref_error)
237
+ else:
238
+ self._resolved_refs.add((ref, resolved[0].get("uuid")))
233
239
  if ref_errors_actual:
234
240
  self._errors["ref"] = ref_errors_actual
235
241
  else:
@@ -291,6 +297,10 @@ class StructuredDataSet:
291
297
  def ref_lookup_cache_miss_count(self) -> int:
292
298
  return self.portal.ref_lookup_cache_miss_count if self.portal else -1
293
299
 
300
+ @property
301
+ def ref_lookup_count(self) -> int:
302
+ return self.portal.ref_lookup_count if self.portal else -1
303
+
294
304
  @property
295
305
  def ref_lookup_found_count(self) -> int:
296
306
  return self.portal.ref_lookup_found_count if self.portal else -1
@@ -561,7 +571,7 @@ class Schema(SchemaBase):
561
571
  the names of any nested properties (i.e objects within objects) flattened into a single
562
572
  property name in dot notation; and set the value of each of these flat property names
563
573
  to the type of the terminal/leaf value of the (either) top-level or nested type. N.B. We
564
- do NOT currently support array-of-arry or array-of-multiple-types. E.g. for this schema:
574
+ do NOT currently support array-of-array or array-of-multiple-types. E.g. for this schema:
565
575
 
566
576
  { "properties": {
567
577
  "abc": {
@@ -779,69 +789,95 @@ class Portal(PortalBase):
779
789
  return self._ref_cache.get(f"/{type_name}/{value}", None)
780
790
  return None
781
791
 
782
- def _cache_ref(self, type_name: str, value: str, resolved: List[str],
783
- subtype_names: Optional[List[str]]) -> None:
792
+ def _cache_ref(self, type_name: str, value: str, resolved: List[str], subtype_names: Optional[List[str]]) -> None:
784
793
  if self._ref_cache is not None:
785
- for type_name in [type_name] + (subtype_names or []):
786
- object_path = f"/{type_name}/{value}"
787
- if self._ref_cache.get(object_path, None) is None:
788
- self._ref_cache[object_path] = resolved
794
+ for type_name in [type_name] + (subtype_names if subtype_names else []):
795
+ self._ref_cache[f"/{type_name}/{value}"] = resolved
789
796
 
790
797
  def ref_exists(self, type_name: str, value: Optional[str] = None) -> List[dict]:
791
798
  if not value:
792
799
  if type_name.startswith("/") and len(parts := type_name[1:].split("/")) == 2:
793
- type_name = parts[0]
794
- value = parts[1]
800
+ if not (type_name := parts[0]) or not (value := parts[1]):
801
+ return []
795
802
  else:
796
- return [] # Should not happen.
803
+ return []
797
804
  if (resolved := self._ref_exists_from_cache(type_name, value)) is not None:
805
+ # Found cached resolved reference.
806
+ if not resolved:
807
+ # Cached resolved reference is empty ([]).
808
+ # It might NOW be found internally, since the portal self._data can change.
809
+ # TODO
810
+ ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
811
+ is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
812
+ subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
813
+ is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
814
+ if is_resolved:
815
+ resolved = [{"type": type_name, "uuid": resolved_uuid}]
816
+ self._cache_ref(type_name, value, resolved, subtype_names)
817
+ return resolved
798
818
  self._ref_exists_cache_hit_count += 1
799
819
  return resolved
800
820
  # Not cached here.
801
821
  self._ref_exists_cache_miss_count += 1
802
- resolved = []
822
+ # Get the lookup strategy.
803
823
  ref_lookup_strategy = self._ref_lookup_strategy(type_name, value)
804
824
  is_ref_lookup_root = StructuredDataSet._is_ref_lookup_root(ref_lookup_strategy)
805
825
  is_ref_lookup_root_first = StructuredDataSet._is_ref_lookup_root_first(ref_lookup_strategy)
806
826
  is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
807
- is_resolved = False
808
- subtype_names = self._get_schema_subtypes(type_name)
827
+ subtype_names = self._get_schema_subtypes(type_name) if is_ref_lookup_subtypes else None
828
+ # Lookup internally first (including at subtypes if desired).
829
+ is_resolved, resolved_uuid = self._ref_exists_internally(type_name, value, subtype_names)
830
+ if is_resolved:
831
+ resolved = [{"type": type_name, "uuid": resolved_uuid}]
832
+ self._cache_ref(type_name, value, resolved, subtype_names)
833
+ return resolved
834
+ # Not found internally; perform actual portal lookup (including at root and subtypes if desired).
835
+ # First construct the list of lookup paths at which to look for the referenced item.
836
+ lookup_paths = []
809
837
  if is_ref_lookup_root_first:
810
- is_resolved, resolved_uuid = self._ref_exists_single(type_name, value, root=True)
811
- if not is_resolved:
812
- is_resolved, resolved_uuid = self._ref_exists_single(type_name, value)
813
- if not is_resolved and is_ref_lookup_root and not is_ref_lookup_root_first:
814
- is_resolved, resolved_uuid = self._ref_exists_single(type_name, value, root=True)
838
+ lookup_paths.append(f"/{value}")
839
+ lookup_paths.append(f"/{type_name}/{value}")
840
+ if is_ref_lookup_root and not is_ref_lookup_root_first:
841
+ lookup_paths.append(f"/{value}")
842
+ if subtype_names:
843
+ for subtype_name in subtype_names:
844
+ lookup_paths.append(f"/{subtype_name}/{value}")
845
+ # Do the actual lookup in the portal for each of the desired lookup paths.
846
+ for lookup_path in lookup_paths:
847
+ if isinstance(item := self.get_metadata(lookup_path), dict):
848
+ resolved = [{"type": type_name, "uuid": item.get("uuid", None)}]
849
+ self._cache_ref(type_name, value, resolved, subtype_names)
850
+ return resolved
851
+ return []
852
+
853
+ def _ref_exists_internally(self, type_name: str, value: str,
854
+ subtype_names: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
855
+ is_resolved, resolved_uuid = self._ref_exists_single_internally(type_name, value)
815
856
  if is_resolved:
816
- resolved.append({"type": type_name, "uuid": resolved_uuid})
817
- # Check for the given ref in all subtypes of the given type.
818
- elif subtype_names and is_ref_lookup_subtypes:
857
+ return True, resolved_uuid
858
+ if subtype_names:
819
859
  for subtype_name in subtype_names:
820
- is_resolved, resolved_uuid = self._ref_exists_single(subtype_name, value)
860
+ is_resolved, resolved_uuid = self._ref_exists_single_internally(subtype_name, value)
821
861
  if is_resolved:
822
- resolved.append({"type": type_name, "uuid": resolved_uuid})
823
- break
824
- # Cache this ref (and all subtype versions of it); whether or not found;
825
- # if not found it will be an empty array (array because caching all matches;
826
- # but TODO - do not think we should do this anymore - maybe test changes needed).
827
- self._cache_ref(type_name, value, resolved, subtype_names)
828
- return resolved
829
-
830
- def _ref_exists_single(self, type_name: str, value: str, root: bool = False) -> Tuple[bool, Optional[str]]:
831
- # Check first in our own data (i.e. e.g. within the given spreadsheet).
862
+ return True, resolved_uuid
863
+ return False, None
864
+
865
+ def _ref_exists_single_internally(self, type_name: str, value: str) -> Tuple[bool, Optional[str]]:
832
866
  if self._data and (items := self._data.get(type_name)) and (schema := self.get_schema(type_name)):
833
- iproperties = set(schema.get("identifyingProperties", [])) | {"identifier", "uuid"}
867
+ identifying_properties = set(schema.get("identifyingProperties", [])) | {"identifier", "uuid"}
834
868
  for item in items:
835
- if (ivalue := next((item[iproperty] for iproperty in iproperties if iproperty in item), None)):
836
- if isinstance(ivalue, list) and value in ivalue or ivalue == value:
837
- self._ref_exists_internal_count += 1
838
- return True, (ivalue if isinstance(ivalue, str) and is_uuid(ivalue) else None)
839
- if (value := self.get_metadata(f"/{type_name}/{value}" if not root else f"/{value}")) is None:
840
- return False, None
841
- return True, value.get("uuid")
869
+ for identifying_property in identifying_properties:
870
+ if (identifying_value := item.get(identifying_property, None)) is not None:
871
+ if ((identifying_value == value) or
872
+ (isinstance(identifying_value, list) and (value in identifying_value))): # noqa
873
+ self._ref_exists_internal_count += 1
874
+ return True, item.get("uuid", None)
875
+ return False, None
842
876
 
843
877
  @property
844
878
  def ref_lookup_cache_hit_count(self) -> int:
879
+ if self._ref_cache is None:
880
+ return 0
845
881
  try:
846
882
  return self.get_metadata_cache.cache_info().hits
847
883
  except Exception:
@@ -849,11 +885,17 @@ class Portal(PortalBase):
849
885
 
850
886
  @property
851
887
  def ref_lookup_cache_miss_count(self) -> int:
888
+ if self._ref_cache is None:
889
+ return self.ref_lookup_count
852
890
  try:
853
891
  return self.get_metadata_cache.cache_info().misses
854
892
  except Exception:
855
893
  return -1
856
894
 
895
+ @property
896
+ def ref_lookup_count(self) -> int:
897
+ return self._ref_lookup_found_count + self._ref_lookup_notfound_count + self._ref_lookup_error_count
898
+
857
899
  @property
858
900
  def ref_lookup_found_count(self) -> int:
859
901
  return self._ref_lookup_found_count
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcicutils
3
- Version: 8.8.0.1b4
3
+ Version: 8.8.0.1b6
4
4
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
5
5
  Home-page: https://github.com/4dn-dcic/utils
6
6
  License: MIT
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
62
62
  dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
63
63
  dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
64
64
  dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
65
- dcicutils/structured_data.py,sha256=zkUoD4xAsmJ_6CyrzLv0TaHBslrd08ufHoi8IP3R6DU,45715
65
+ dcicutils/structured_data.py,sha256=7JDesiA0geGkP343yV3z9Bkc8qN22RKoT20cHrecEYA,47985
66
66
  dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
67
67
  dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
68
68
  dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
69
69
  dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
70
70
  dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
71
71
  dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
72
- dcicutils-8.8.0.1b4.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
73
- dcicutils-8.8.0.1b4.dist-info/METADATA,sha256=n0Bbllxdjglkx2L-vXZhvFsfu7CsFy2UVUKHgHZd7JU,3356
74
- dcicutils-8.8.0.1b4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
75
- dcicutils-8.8.0.1b4.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
76
- dcicutils-8.8.0.1b4.dist-info/RECORD,,
72
+ dcicutils-8.8.0.1b6.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
73
+ dcicutils-8.8.0.1b6.dist-info/METADATA,sha256=-MVcTLgcFRea1f0P8L91J8zmo1wbjbUbPr-V82goavo,3356
74
+ dcicutils-8.8.0.1b6.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
75
+ dcicutils-8.8.0.1b6.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
76
+ dcicutils-8.8.0.1b6.dist-info/RECORD,,