dcicutils 8.8.0.1b22__py3-none-any.whl → 8.8.0.1b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  from copy import deepcopy
2
2
  from functools import lru_cache
3
3
  import re
4
- from typing import Any, List, Optional, Tuple, Type, Union
4
+ from typing import Any, Callable, List, Optional, Tuple, Type, Union
5
5
  from dcicutils.data_readers import RowReader
6
6
  from dcicutils.misc_utils import create_readonly_object
7
7
  from dcicutils.portal_utils import Portal
@@ -19,7 +19,7 @@ class PortalObject:
19
19
  self._data = data if isinstance(data, dict) else {}
20
20
  self._portal = portal if isinstance(portal, Portal) else None
21
21
  self._schema = schema if isinstance(schema, dict) else (schema.data if isinstance(schema, Schema) else None)
22
- self._type = type if isinstance(type, str) and type else None
22
+ self._type = type if isinstance(type, str) else ""
23
23
 
24
24
  @property
25
25
  def data(self) -> dict:
@@ -31,8 +31,8 @@ class PortalObject:
31
31
 
32
32
  @property
33
33
  @lru_cache(maxsize=1)
34
- def type(self) -> Optional[str]:
35
- return self._type or Portal.get_schema_type(self._data) or (Schema(self._schema).type if self._schema else None)
34
+ def type(self) -> str:
35
+ return self._type or Portal.get_schema_type(self._data) or (Schema(self._schema).type if self._schema else "")
36
36
 
37
37
  @property
38
38
  @lru_cache(maxsize=1)
@@ -75,86 +75,42 @@ class PortalObject:
75
75
  identifying_properties.append("aliases")
76
76
  return identifying_properties or None
77
77
 
78
- @property
79
- @lru_cache(maxsize=1)
80
- def identifying_paths(self) -> Optional[List[str]]:
81
- """
82
- Returns a list of the possible Portal URL paths identifying this Portal object.
83
- """
84
- identifying_paths = []
85
- if not (identifying_properties := self.identifying_properties):
86
- if self.uuid:
87
- if self.type:
88
- identifying_paths.append(f"/{self.type}/{self.uuid}")
89
- identifying_paths.append(f"/{self.uuid}")
90
- return identifying_paths
91
- for identifying_property in identifying_properties:
92
- if (identifying_value := self._data.get(identifying_property)):
93
- if identifying_property == "uuid":
94
- identifying_paths.append(f"/{self.type}/{identifying_value}")
95
- identifying_paths.append(f"/{identifying_value}")
96
- # For now at least we include the path both with and without the schema type component,
97
- # as for some identifying values, it works (only) with, and some, it works (only) without.
98
- # For example: If we have FileSet with "accession", an identifying property, with value
99
- # SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
100
- # conversely using "submitted_id", also an identifying property, with value
101
- # UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
102
- # not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
103
- elif isinstance(identifying_value, list):
104
- for identifying_value_item in identifying_value:
105
- if self.type:
106
- identifying_paths.append(f"/{self.type}/{identifying_value_item}")
107
- identifying_paths.append(f"/{identifying_value_item}")
108
- else:
109
- if (schema := self.schema):
110
- if pattern := schema.get("properties", {}).get(identifying_property, {}).get("pattern"):
111
- if not re.match(pattern, identifying_value):
112
- # If this identifying value is for a (identifying) property which has a
113
- # pattern, and the value does NOT match the pattern, then do NOT include
114
- # this value as an identifying path, since it cannot possibly be found.
115
- continue
116
- if self.type:
117
- identifying_paths.append(f"/{self.type}/{identifying_value}")
118
- identifying_paths.append(f"/{identifying_value}")
119
- return identifying_paths or None
120
-
121
- @property
122
- @lru_cache(maxsize=1)
123
- def identifying_path(self) -> Optional[str]:
124
- if identifying_paths := self.identifying_paths:
125
- return identifying_paths[0]
126
-
127
- def lookup(self, include_identifying_path: bool = False,
128
- raw: bool = False) -> Optional[Union[Tuple[PortalObject, str], PortalObject]]:
129
- return self._lookup(raw=raw) if include_identifying_path else self._lookup(raw=raw)[0]
130
-
131
- def lookup_identifying_path(self) -> Optional[str]:
132
- return self._lookup()[1]
133
-
134
- def _lookup(self, raw: bool = False) -> Tuple[Optional[PortalObject], Optional[str]]:
78
+ @lru_cache(maxsize=8192)
79
+ def lookup(self, raw: bool = False,
80
+ ref_lookup_strategy: Optional[Callable] = None) -> Tuple[Optional[PortalObject], Optional[str], int]:
81
+ nlookups = 0
82
+ first_identifying_path = None
135
83
  try:
136
- if identifying_paths := self.identifying_paths:
84
+ if identifying_paths := self._get_identifying_paths(ref_lookup_strategy=ref_lookup_strategy):
137
85
  for identifying_path in identifying_paths:
86
+ if not first_identifying_path:
87
+ first_identifying_path = identifying_path
88
+ nlookups += 1
138
89
  if (value := self._portal.get(identifying_path, raw=raw)) and (value.status_code == 200):
139
- return PortalObject(value.json(),
140
- portal=self._portal, type=self.type if raw else None), identifying_path
90
+ return (
91
+ PortalObject(value.json(), portal=self._portal, type=self.type if raw else None),
92
+ identifying_path,
93
+ nlookups
94
+ )
141
95
  except Exception:
142
96
  pass
143
- return None, self.identifying_path
97
+ return None, first_identifying_path, nlookups
144
98
 
145
99
  def compare(self, value: Union[dict, PortalObject],
146
- consider_refs: bool = False, resolved_refs: List[dict] = None) -> dict:
100
+ consider_refs: bool = False, resolved_refs: List[dict] = None) -> Tuple[dict, int]:
147
101
  if consider_refs and isinstance(resolved_refs, list):
148
- this_data = self.normalized_refs(refs=resolved_refs).data
102
+ normlized_portal_object, nlookups = self._normalized_refs(refs=resolved_refs)
103
+ this_data = normlized_portal_object.data
149
104
  else:
150
105
  this_data = self.data
106
+ nlookups = 0
151
107
  if isinstance(value, PortalObject):
152
108
  comparing_data = value.data
153
109
  elif isinstance(value, dict):
154
110
  comparing_data = value
155
111
  else:
156
- return {}
157
- return PortalObject._compare(this_data, comparing_data)
112
+ return {}, nlookups
113
+ return PortalObject._compare(this_data, comparing_data), nlookups
158
114
 
159
115
  @staticmethod
160
116
  def _compare(a: Any, b: Any, _path: Optional[str] = None) -> dict:
@@ -201,42 +157,106 @@ class PortalObject:
201
157
  diffs[_path] = diff_updating(a, b)
202
158
  return diffs
203
159
 
204
- def normalize_refs(self, refs: List[dict]) -> None:
160
+ @lru_cache(maxsize=1)
161
+ def _get_identifying_paths(self, ref_lookup_strategy: Optional[Callable] = None) -> Optional[List[str]]:
205
162
  """
206
- Turns any (linkTo) references which are paths (e.g. /SubmissionCenter/uwsc_gcc) within
207
- this Portal object into the uuid style reference (e.g. d1b67068-300f-483f-bfe8-63d23c93801f),
208
- based on the given "refs" list which is assumed to be a list of dictionaries, where each
209
- contains a "path" and a "uuid" property; this list is typically (for our first usage of
210
- this function) the value of structured_data.StructuredDataSet.resolved_refs_with_uuid.
211
- Changes are made to this Portal object in place; use normalized_refs function to make a copy.
212
- If there are no "refs" (None or empty) or if the speicified reference is not found in this
213
- list then the references will be looked up via Portal calls (via Portal.get_metadata).
163
+ Returns a list of the possible Portal URL paths identifying this Portal object.
214
164
  """
215
- PortalObject._normalize_refs(self.data, refs=refs, schema=self.schema, portal=self.portal)
165
+ identifying_paths = []
166
+ if not (identifying_properties := self.identifying_properties):
167
+ if self.uuid:
168
+ if self.type:
169
+ identifying_paths.append(f"/{self.type}/{self.uuid}")
170
+ identifying_paths.append(f"/{self.uuid}")
171
+ return identifying_paths
172
+ for identifying_property in identifying_properties:
173
+ if identifying_value := self._data.get(identifying_property):
174
+ if identifying_property == "uuid":
175
+ if self.type:
176
+ identifying_paths.append(f"/{self.type}/{identifying_value}")
177
+ identifying_paths.append(f"/{identifying_value}")
178
+ # For now at least we include the path both with and without the schema type component,
179
+ # as for some identifying values, it works (only) with, and some, it works (only) without.
180
+ # For example: If we have FileSet with "accession", an identifying property, with value
181
+ # SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
182
+ # conversely using "submitted_id", also an identifying property, with value
183
+ # UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
184
+ # not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
185
+ elif isinstance(identifying_value, list):
186
+ for identifying_value_item in identifying_value:
187
+ if self.type:
188
+ identifying_paths.append(f"/{self.type}/{identifying_value_item}")
189
+ identifying_paths.append(f"/{identifying_value_item}")
190
+ else:
191
+ # TODO: Import from somewhere ...
192
+ lookup_options = 0
193
+ if schema := self.schema:
194
+ # TODO: Hook into the ref_lookup_strategy thing in structured_data to make
195
+ # sure we check accession format (since it does not have a pattern).
196
+ if callable(ref_lookup_strategy):
197
+ lookup_options, ref_validator = ref_lookup_strategy(
198
+ self.type, schema, identifying_value)
199
+ if callable(ref_validator):
200
+ if ref_validator(schema, identifying_property, identifying_value) is False:
201
+ continue
202
+ if pattern := schema.get("properties", {}).get(identifying_property, {}).get("pattern"):
203
+ if not re.match(pattern, identifying_value):
204
+ # If this identifying value is for a (identifying) property which has a
205
+ # pattern, and the value does NOT match the pattern, then do NOT include
206
+ # this value as an identifying path, since it cannot possibly be found.
207
+ continue
208
+ if not lookup_options:
209
+ lookup_options = Portal.LOOKUP_DEFAULT
210
+ if Portal.is_lookup_root_first(lookup_options):
211
+ identifying_paths.append(f"/{identifying_value}")
212
+ if Portal.is_lookup_specified_type(lookup_options) and self.type:
213
+ identifying_paths.append(f"/{self.type}/{identifying_value}")
214
+ if Portal.is_lookup_root(lookup_options) and not Portal.is_lookup_root_first(lookup_options):
215
+ identifying_paths.append(f"/{identifying_value}")
216
+ if Portal.is_lookup_subtypes(lookup_options):
217
+ for subtype_name in self._portal.get_schema_subtype_names(self.type):
218
+ identifying_paths.append(f"/{subtype_name}/{identifying_value}")
219
+ return identifying_paths or None
216
220
 
217
- def normalized_refs(self, refs: List[dict]) -> PortalObject:
221
+ def _normalized_refs(self, refs: List[dict]) -> Tuple[PortalObject, int]:
218
222
  """
219
- Same as normalize_ref but does not make this change to this Portal object in place,
223
+ Same as _normalize_refs but does NOT make this change to this Portal object IN PLACE,
220
224
  rather it returns a new instance of this Portal object wrapped in a new PortalObject.
221
225
  """
222
226
  portal_object = self.copy()
223
- portal_object.normalize_refs(refs)
224
- return portal_object
227
+ nlookups = portal_object._normalize_refs(refs)
228
+ return portal_object, nlookups
229
+
230
+ def _normalize_refs(self, refs: List[dict]) -> int:
231
+ """
232
+ Turns any (linkTo) references which are paths (e.g. /SubmissionCenter/uwsc_gcc) within this
233
+ object IN PLACE into the uuid style reference (e.g. d1b67068-300f-483f-bfe8-63d23c93801f),
234
+ based on the given "refs" list which is assumed to be a list of dictionaries, where each
235
+ contains a "path" and a "uuid" property; this list is typically (for our first usage of
236
+ this function) the value of structured_data.StructuredDataSet.resolved_refs_with_uuid.
237
+ Changes are made to this Portal object IN PLACE; use _normalized_refs function to make a copy.
238
+ If there are no "refs" (None or empty) or if the specified reference is not found in this
239
+ list then the references will be looked up via Portal calls (via Portal.get_metadata).
240
+ """
241
+ _, nlookups = PortalObject._normalize_data_refs(self.data, refs=refs, schema=self.schema, portal=self.portal)
242
+ return nlookups
225
243
 
226
244
  @staticmethod
227
- def _normalize_refs(value: Any, refs: List[dict], schema: dict, portal: Portal, _path: Optional[str] = None) -> Any:
245
+ def _normalize_data_refs(value: Any, refs: List[dict], schema: dict,
246
+ portal: Portal, _path: Optional[str] = None) -> Tuple[Any, int]:
247
+ nlookups = 0
228
248
  if not value or not isinstance(schema, dict):
229
- return value
249
+ return value, nlookups
230
250
  if isinstance(value, dict):
231
251
  for key in value:
232
252
  path = f"{_path}.{key}" if _path else key
233
- value[key] = PortalObject._normalize_refs(value[key], refs=refs,
234
- schema=schema, portal=portal, _path=path)
253
+ value[key], nlookups = PortalObject._normalize_data_refs(value[key], refs=refs,
254
+ schema=schema, portal=portal, _path=path)
235
255
  elif isinstance(value, list):
236
256
  for index in range(len(value)):
237
257
  path = f"{_path or ''}#{index}"
238
- value[index] = PortalObject._normalize_refs(value[index], refs=refs,
239
- schema=schema, portal=portal, _path=path)
258
+ value[index], nlookups = PortalObject._normalize_data_refs(value[index], refs=refs,
259
+ schema=schema, portal=portal, _path=path)
240
260
  elif value_type := Schema.get_property_by_path(schema, _path):
241
261
  if link_to := value_type.get("linkTo"):
242
262
  ref_path = f"/{link_to}/{value}"
@@ -247,7 +267,7 @@ class PortalObject:
247
267
  else:
248
268
  ref_uuid = None
249
269
  if ref_uuid:
250
- return ref_uuid
270
+ return ref_uuid, nlookups
251
271
  # Here our (linkTo) reference appears not to be in the given refs; if these refs came
252
272
  # from structured_data.StructuredDataSet.resolved_refs_with_uuid (in the context of
253
273
  # smaht-submitr, which is the typical/first use case for this function) then this could
@@ -255,6 +275,7 @@ class PortalObject:
255
275
  # the data/spreadsheet being submitted. In any case, we don't have the associated uuid
256
276
  # so let us look it up here.
257
277
  if isinstance(portal, Portal):
278
+ nlookups += 1
258
279
  if (ref_object := portal.get_metadata(ref_path)) and (ref_uuid := ref_object.get("uuid")):
259
- return ref_uuid
260
- return value
280
+ return ref_uuid, nlookups
281
+ return value, nlookups
dcicutils/portal_utils.py CHANGED
@@ -46,6 +46,22 @@ class Portal:
46
46
  KEYS_FILE_DIRECTORY = "~"
47
47
  MIME_TYPE_JSON = "application/json"
48
48
 
49
+ # Object lookup strategies; on a per-reference (type/value) basis, used currently ONLY by
50
+ # structured_data.py; controlled by an optional ref_lookup_strategy callable; default is
51
+ # lookup at root path but after the specified type path lookup, and then lookup all subtypes;
52
+ # can choose to lookup root path first, or not lookup root path at all, or not lookup
53
+ # subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
54
+ # and value (string) arguments and return an integer of any of the below ORed together.
55
+ # The main purpose of this is optimization; to minimize portal lookups; since for example,
56
+ # currently at least, /{type}/{accession} does not work but /{accession} does; so we
57
+ # currently (smaht-portal/.../ingestion_processors) use LOOKUP_ROOT_FIRST for this.
58
+ # And current usage NEVER has LOOKUP_SUBTYPES turned OFF; but support just in case.
59
+ LOOKUP_SPECIFIED_TYPE = 0x0001
60
+ LOOKUP_ROOT = 0x0002
61
+ LOOKUP_ROOT_FIRST = 0x0004 | LOOKUP_ROOT
62
+ LOOKUP_SUBTYPES = 0x0008
63
+ LOOKUP_DEFAULT = LOOKUP_SPECIFIED_TYPE | LOOKUP_ROOT | LOOKUP_SUBTYPES
64
+
49
65
  def __init__(self,
50
66
  arg: Optional[Union[Portal, TestApp, VirtualApp, PyramidRouter, dict, tuple, str]] = None,
51
67
  env: Optional[str] = None, server: Optional[str] = None,
@@ -188,9 +204,27 @@ class Portal:
188
204
  def vapp(self) -> Optional[TestApp]:
189
205
  return self._vapp
190
206
 
207
+ @staticmethod
208
+ def is_lookup_specified_type(lookup_options: int) -> bool:
209
+ return (lookup_options &
210
+ Portal.LOOKUP_SPECIFIED_TYPE) == Portal.LOOKUP_SPECIFIED_TYPE
211
+
212
+ @staticmethod
213
+ def is_lookup_root(lookup_options: int) -> bool:
214
+ return (lookup_options & Portal.LOOKUP_ROOT) == Portal.LOOKUP_ROOT
215
+
216
+ @staticmethod
217
+ def is_lookup_root_first(lookup_options: int) -> bool:
218
+ return (lookup_options & Portal.LOOKUP_ROOT_FIRST) == Portal.LOOKUP_ROOT_FIRST
219
+
220
+ @staticmethod
221
+ def is_lookup_subtypes(lookup_options: int) -> bool:
222
+ return (lookup_options & Portal.LOOKUP_SUBTYPES) == Portal.LOOKUP_SUBTYPES
223
+
191
224
  def get(self, url: str, follow: bool = True,
192
225
  raw: bool = False, database: bool = False, raise_for_status: bool = False, **kwargs) -> OptionalResponse:
193
226
  url = self.url(url, raw, database)
227
+ # print(f'xyzzy.portal.get({url})')
194
228
  if not self.vapp:
195
229
  response = requests.get(url, allow_redirects=follow, **self._kwargs(**kwargs))
196
230
  else:
@@ -205,6 +239,7 @@ class Portal:
205
239
  def patch(self, url: str, data: Optional[dict] = None, json: Optional[dict] = None,
206
240
  raise_for_status: bool = False, **kwargs) -> OptionalResponse:
207
241
  url = self.url(url)
242
+ # print(f'xyzzy.portal.patch({url})')
208
243
  if not self.vapp:
209
244
  response = requests.patch(url, data=data, json=json, **self._kwargs(**kwargs))
210
245
  else:
@@ -217,6 +252,7 @@ class Portal:
217
252
  def post(self, url: str, data: Optional[dict] = None, json: Optional[dict] = None, files: Optional[dict] = None,
218
253
  raise_for_status: bool = False, **kwargs) -> OptionalResponse:
219
254
  url = self.url(url)
255
+ # print(f'xyzzy.portal.post({url})')
220
256
  if files and not ("headers" in kwargs):
221
257
  # Setting headers to None when using files implies content-type multipart/form-data.
222
258
  kwargs["headers"] = None
@@ -233,6 +269,7 @@ class Portal:
233
269
  return response
234
270
 
235
271
  def get_metadata(self, object_id: str, raw: bool = False, database: bool = False) -> Optional[dict]:
272
+ # print(f'xyzzy.portal.get_metadata({object_id})')
236
273
  if isinstance(raw, bool) and raw:
237
274
  add_on = "frame=raw" + ("&datastore=database" if isinstance(database, bool) and database else "")
238
275
  elif database:
@@ -242,11 +279,13 @@ class Portal:
242
279
  return get_metadata(obj_id=object_id, vapp=self.vapp, key=self.key, add_on=add_on)
243
280
 
244
281
  def patch_metadata(self, object_id: str, data: dict) -> Optional[dict]:
282
+ # print(f'xyzzy.portal.patch_metadata({object_id})')
245
283
  if self.key:
246
284
  return patch_metadata(obj_id=object_id, patch_item=data, key=self.key)
247
285
  return self.patch(f"/{object_id}", data).json()
248
286
 
249
287
  def post_metadata(self, object_type: str, data: dict) -> Optional[dict]:
288
+ # print(f'xyzzy.portal.post_metadata({object_type})')
250
289
  if self.key:
251
290
  return post_metadata(schema_name=object_type, post_item=data, key=self.key)
252
291
  return self.post(f"/{object_type}", data).json()
@@ -358,6 +397,12 @@ class Portal:
358
397
  super_type_map_flattened[super_type_name] = list_breadth_first(super_type_map, super_type_name)
359
398
  return super_type_map_flattened
360
399
 
400
+ @lru_cache(maxsize=64)
401
+ def get_schema_subtype_names(self, type_name: str) -> List[str]:
402
+ if not (schemas_super_type_map := self.get_schemas_super_type_map()):
403
+ return []
404
+ return schemas_super_type_map.get(type_name, [])
405
+
361
406
  def url(self, url: str, raw: bool = False, database: bool = False) -> str:
362
407
  if not isinstance(url, str) or not url:
363
408
  return "/"
@@ -48,22 +48,6 @@ StructuredDataSet = Type["StructuredDataSet"]
48
48
 
49
49
  class StructuredDataSet:
50
50
 
51
- # Reference (linkTo) lookup strategies; on a per-reference (type/value) basis;
52
- # controlled by optional ref_lookup_strategy callable; default is lookup at root path
53
- # but after the named reference (linkTo) type path lookup, and then lookup all subtypes;
54
- # can choose to lookup root path first, or not lookup root path at all, or not lookup
55
- # subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
56
- # and value (string) arguements and return an integer of any of the below ORed together.
57
- # The main purpose of this is optimization; to minimize portal lookups; since for example,
58
- # currently at least, /{type}/{accession} does not work but /{accession} does; so we
59
- # currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
60
- # And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
61
- REF_LOOKUP_SPECIFIED_TYPE = 0x0001
62
- REF_LOOKUP_ROOT = 0x0002
63
- REF_LOOKUP_ROOT_FIRST = 0x0004 | REF_LOOKUP_ROOT
64
- REF_LOOKUP_SUBTYPES = 0x0008
65
- REF_LOOKUP_DEFAULT = REF_LOOKUP_SPECIFIED_TYPE | REF_LOOKUP_ROOT | REF_LOOKUP_SUBTYPES
66
-
67
51
  def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
68
52
  schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
69
53
  order: Optional[List[str]] = None, prune: bool = True,
@@ -76,6 +60,7 @@ class StructuredDataSet:
76
60
  self._portal = Portal(portal, data=self._data, schemas=schemas,
77
61
  ref_lookup_strategy=ref_lookup_strategy,
78
62
  ref_lookup_nocache=ref_lookup_nocache) if portal else None
63
+ self._ref_lookup_strategy = ref_lookup_strategy
79
64
  self._order = order
80
65
  self._prune = prune
81
66
  self._warnings = {}
@@ -199,25 +184,49 @@ class StructuredDataSet:
199
184
  upload_file["path"] = file_path
200
185
  return upload_files
201
186
 
202
- def compare(self) -> dict:
187
+ def compare(self, progress: Optional[Callable] = None) -> dict:
188
+ def get_counts() -> int:
189
+ ntypes = 0
190
+ nobjects = 0
191
+ if self.data:
192
+ ntypes = len(self.data)
193
+ for type_name in self.data:
194
+ nobjects += len(self.data[type_name])
195
+ return ntypes, nobjects
203
196
  diffs = {}
204
- if self.data or self.portal:
197
+ if callable(progress):
198
+ ntypes, nobjects = get_counts()
199
+ progress({"start": True, "types": ntypes, "objects": nobjects})
200
+ if self.data or self.portal: # TODO: what is this OR biz?
205
201
  refs = self.resolved_refs_with_uuids
206
- for object_type in self.data:
207
- if not diffs.get(object_type):
208
- diffs[object_type] = []
209
- for portal_object in self.data[object_type]:
210
- portal_object = PortalObject(portal_object, portal=self.portal, type=object_type)
211
- existing_object, identifying_path = portal_object.lookup(include_identifying_path=True, raw=True)
202
+ # TODO: Need feedback/progress tracking mechanism here.
203
+ # TODO: Check validity of reference; actually check that earlier on even maybe.
204
+ for type_name in self.data:
205
+ if not diffs.get(type_name):
206
+ diffs[type_name] = []
207
+ for portal_object in self.data[type_name]:
208
+ portal_object = PortalObject(portal_object, portal=self.portal, type=type_name)
209
+ existing_object, identifying_path, nlookups = (
210
+ portal_object.lookup(raw=True, ref_lookup_strategy=self._ref_lookup_strategy))
212
211
  if existing_object:
213
- object_diffs = portal_object.compare(existing_object, consider_refs=True, resolved_refs=refs)
214
- diffs[object_type].append(create_readonly_object(path=identifying_path,
215
- uuid=existing_object.uuid,
216
- diffs=object_diffs or None))
212
+ object_diffs, nlookups_compare = portal_object.compare(
213
+ existing_object, consider_refs=True, resolved_refs=refs)
214
+ diffs[type_name].append(create_readonly_object(path=identifying_path,
215
+ uuid=existing_object.uuid,
216
+ diffs=object_diffs or None))
217
+ if callable(progress):
218
+ progress({"update": True, "lookups": nlookups + nlookups_compare})
217
219
  elif identifying_path:
218
220
  # If there is no existing object we still create a record for this object
219
221
  # but with no uuid which will be the indication that it does not exist.
220
- diffs[object_type].append(create_readonly_object(path=identifying_path, uuid=None, diffs=None))
222
+ diffs[type_name].append(create_readonly_object(path=identifying_path, uuid=None, diffs=None))
223
+ if callable(progress):
224
+ progress({"create": True, "lookups": nlookups})
225
+ else:
226
+ if callable(progress):
227
+ progress({"lookups": nlookups})
228
+ if callable(progress):
229
+ progress({"finish": True})
221
230
  return diffs
222
231
 
223
232
  def _load_file(self, file: str) -> None:
@@ -251,7 +260,7 @@ class StructuredDataSet:
251
260
  self._load_reader(CsvReader(file), type_name=Schema.type_name(file))
252
261
 
253
262
  def _load_excel_file(self, file: str) -> None:
254
- def calculate_total_rows_to_process() -> Tuple[int, int]:
263
+ def get_counts() -> Tuple[int, int]:
255
264
  nonlocal file
256
265
  excel = Excel(file)
257
266
  nrows = 0
@@ -260,18 +269,27 @@ class StructuredDataSet:
260
269
  nrows += 1
261
270
  return nrows, len(excel.sheet_names)
262
271
  if self._progress:
263
- self._progress_update(calculate_total_rows_to_process)
272
+ nrows, nsheets = get_counts()
273
+ self._progress({"start": True, "sheets": nsheets, "rows": nrows})
274
+ """
275
+ if self._progress:
276
+ self._progress_update(get_counts)
277
+ """
264
278
  excel = Excel(file) # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl).
265
279
  order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {}
266
280
  for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)):
267
281
  self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name))
282
+ if self._progress:
283
+ self._progress({"finish": True})
284
+ # TODO: Do we really need progress reporting for the below?
268
285
  # Check for unresolved reference errors which really are not because of ordering.
269
286
  # Yes such internal references will be handled correctly on actual database update via snovault.loadxl.
270
287
  if ref_errors := self.ref_errors:
271
288
  ref_errors_actual = []
272
289
  for ref_error in ref_errors:
273
290
  if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
274
- # if not (resolved := self.portal.ref_exists_internally(ref := ref_error["error"])): # TODO
291
+ # TODO: Probably do this instead; and if so then no progress needed (per question above).
292
+ # if not (resolved := self.portal.ref_exists_internally(ref := ref_error["error"])):
275
293
  ref_errors_actual.append(ref_error)
276
294
  else:
277
295
  self._resolved_refs.add((ref, resolved.get("uuid")))
@@ -304,13 +322,15 @@ class StructuredDataSet:
304
322
  self._add_properties(structured_row, self._autoadd_properties, schema)
305
323
  self._add(type_name, structured_row)
306
324
  if self._progress:
307
- self._progress_update(-1,
308
- self.ref_total_count,
309
- self.ref_total_found_count,
310
- self.ref_total_notfound_count,
311
- self.ref_lookup_count,
312
- self.ref_lookup_cache_hit_count,
313
- self.ref_invalid_identifying_property_count)
325
+ self._progress({
326
+ "parse": True,
327
+ "refs": self.ref_total_count,
328
+ "refs_found": self.ref_total_found_count,
329
+ "refs_not_found": self.ref_total_notfound_count,
330
+ "refs_lookup": self.ref_lookup_count,
331
+ "refs_cache_hit": self.ref_lookup_cache_hit_count,
332
+ "refs_invalid": self.ref_invalid_identifying_property_count
333
+ })
314
334
  self._note_warning(reader.warnings, "reader")
315
335
  if schema:
316
336
  self._note_error(schema._unresolved_refs, "ref")
@@ -331,16 +351,16 @@ class StructuredDataSet:
331
351
 
332
352
  def _is_ref_lookup_specified_type(ref_lookup_flags: int) -> bool:
333
353
  return (ref_lookup_flags &
334
- StructuredDataSet.REF_LOOKUP_SPECIFIED_TYPE) == StructuredDataSet.REF_LOOKUP_SPECIFIED_TYPE
354
+ Portal.LOOKUP_SPECIFIED_TYPE) == Portal.LOOKUP_SPECIFIED_TYPE
335
355
 
336
356
  def _is_ref_lookup_root(ref_lookup_flags: int) -> bool:
337
- return (ref_lookup_flags & StructuredDataSet.REF_LOOKUP_ROOT) == StructuredDataSet.REF_LOOKUP_ROOT
357
+ return (ref_lookup_flags & Portal.LOOKUP_ROOT) == Portal.LOOKUP_ROOT
338
358
 
339
359
  def _is_ref_lookup_root_first(ref_lookup_flags: int) -> bool:
340
- return (ref_lookup_flags & StructuredDataSet.REF_LOOKUP_ROOT_FIRST) == StructuredDataSet.REF_LOOKUP_ROOT_FIRST
360
+ return (ref_lookup_flags & Portal.LOOKUP_ROOT_FIRST) == Portal.LOOKUP_ROOT_FIRST
341
361
 
342
362
  def _is_ref_lookup_subtypes(ref_lookup_flags: int) -> bool:
343
- return (ref_lookup_flags & StructuredDataSet.REF_LOOKUP_SUBTYPES) == StructuredDataSet.REF_LOOKUP_SUBTYPES
363
+ return (ref_lookup_flags & Portal.LOOKUP_SUBTYPES) == Portal.LOOKUP_SUBTYPES
344
364
 
345
365
  @property
346
366
  def ref_total_count(self) -> int:
@@ -786,7 +806,7 @@ class Portal(PortalBase):
786
806
  if callable(ref_lookup_strategy):
787
807
  self._ref_lookup_strategy = ref_lookup_strategy
788
808
  else:
789
- self._ref_lookup_strategy = lambda type_name, schema, value: (StructuredDataSet.REF_LOOKUP_DEFAULT, None)
809
+ self._ref_lookup_strategy = lambda type_name, schema, value: (Portal.LOOKUP_DEFAULT, None)
790
810
  if ref_lookup_nocache is True:
791
811
  self.ref_lookup = self.ref_lookup_uncached
792
812
  self._ref_cache = None
@@ -844,7 +864,7 @@ class Portal(PortalBase):
844
864
  return schemas
845
865
 
846
866
  @lru_cache(maxsize=64)
847
- def _get_schema_subtypes_names(self, type_name: str) -> List[str]:
867
+ def _get_schema_subtype_names(self, type_name: str) -> List[str]:
848
868
  if not (schemas_super_type_map := self.get_schemas_super_type_map()):
849
869
  return []
850
870
  return schemas_super_type_map.get(type_name, [])
@@ -907,7 +927,7 @@ class Portal(PortalBase):
907
927
  lookup_paths.append(f"/{type_name}/{value}")
908
928
  if is_ref_lookup_root and not is_ref_lookup_root_first:
909
929
  lookup_paths.append(f"/{value}")
910
- subtype_names = self._get_schema_subtypes_names(type_name) if is_ref_lookup_subtypes else []
930
+ subtype_names = self._get_schema_subtype_names(type_name) if is_ref_lookup_subtypes else []
911
931
  for subtype_name in subtype_names:
912
932
  lookup_paths.append(f"/{subtype_name}/{value}")
913
933
  if not lookup_paths:
@@ -946,7 +966,7 @@ class Portal(PortalBase):
946
966
  ref_lookup_strategy, ref_validator = (
947
967
  self._ref_lookup_strategy(type_name, self.get_schema(type_name), value))
948
968
  is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
949
- subtype_names = self._get_schema_subtypes_names(type_name) if is_ref_lookup_subtypes else []
969
+ subtype_names = self._get_schema_subtype_names(type_name) if is_ref_lookup_subtypes else []
950
970
  for type_name in [type_name] + subtype_names:
951
971
  is_resolved, resolved_item = self._ref_exists_single_internally(type_name, value)
952
972
  if is_resolved:
@@ -1008,7 +1028,7 @@ class Portal(PortalBase):
1008
1028
  if not is_uuid(property_value):
1009
1029
  return False
1010
1030
  return True
1011
- for schema_name in [type_name] + self._get_schema_subtypes_names(type_name):
1031
+ for schema_name in [type_name] + self._get_schema_subtype_names(type_name):
1012
1032
  if schema := self.get_schema(schema_name):
1013
1033
  if identifying_properties := schema.get("identifyingProperties"):
1014
1034
  for identifying_property in identifying_properties:
@@ -1033,7 +1053,7 @@ class Portal(PortalBase):
1033
1053
 
1034
1054
  def _cache_ref(self, type_name: str, value: str, resolved: List[str]) -> None:
1035
1055
  if self._ref_cache is not None:
1036
- subtype_names = self._get_schema_subtypes_names(type_name)
1056
+ subtype_names = self._get_schema_subtype_names(type_name)
1037
1057
  for type_name in [type_name] + subtype_names:
1038
1058
  self._ref_cache[f"/{type_name}/{value}"] = resolved
1039
1059
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcicutils
3
- Version: 8.8.0.1b22
3
+ Version: 8.8.0.1b24
4
4
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
5
5
  Home-page: https://github.com/4dn-dcic/utils
6
6
  License: MIT
@@ -46,8 +46,8 @@ dcicutils/log_utils.py,sha256=7pWMc6vyrorUZQf-V-M3YC6zrPgNhuV_fzm9xqTPph0,10883
46
46
  dcicutils/misc_utils.py,sha256=zVc4urdVGgnWjQ4UQlrGH-URAzr2l_PwZWI3u_GJdFE,102210
47
47
  dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmjw,5963
48
48
  dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
49
- dcicutils/portal_object_utils.py,sha256=khp7E0nb-aXuZWkL8_siftSlYCTOMHAd_2xHStM-maY,13613
50
- dcicutils/portal_utils.py,sha256=OR1wJNiZVEsxcweArA4-K6yiuy7_bCqawxhZnuFsUtM,27686
49
+ dcicutils/portal_object_utils.py,sha256=7gteQ5CM6IVDfHx-UPFiOfeE1fJYOir_uwWdRTykExQ,15374
50
+ dcicutils/portal_utils.py,sha256=trM8L9J1CPXntdsKSL56hy7SMpftNNpIReAn5iihGes,30050
51
51
  dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
52
52
  dcicutils/qa_checkers.py,sha256=cdXjeL0jCDFDLT8VR8Px78aS10hwNISOO5G_Zv2TZ6M,20534
53
53
  dcicutils/qa_utils.py,sha256=TT0SiJWiuxYvbsIyhK9VO4uV_suxhB6CpuC4qPacCzQ,160208
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
62
62
  dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
63
63
  dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
64
64
  dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
65
- dcicutils/structured_data.py,sha256=Yrpv1EgCOhwSaNk2ZMuzbgyxpmAVXpuJ6_4g4LMC2Q8,57180
65
+ dcicutils/structured_data.py,sha256=aXyLqYlgp5DInGiWw6WZk08vfvKI0IW2eWW57n0zhz0,57549
66
66
  dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
67
67
  dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
68
68
  dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
69
69
  dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
70
70
  dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
71
71
  dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
72
- dcicutils-8.8.0.1b22.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
73
- dcicutils-8.8.0.1b22.dist-info/METADATA,sha256=LKPWDLxkixPVLiNnypRH46KEEcwkr_beWBf6hEwz22E,3357
74
- dcicutils-8.8.0.1b22.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
75
- dcicutils-8.8.0.1b22.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
76
- dcicutils-8.8.0.1b22.dist-info/RECORD,,
72
+ dcicutils-8.8.0.1b24.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
73
+ dcicutils-8.8.0.1b24.dist-info/METADATA,sha256=z5fS6VG0wLUCd3NTEvsMF5rdWwLg7TmWlTs1Syg-7Sk,3357
74
+ dcicutils-8.8.0.1b24.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
75
+ dcicutils-8.8.0.1b24.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
76
+ dcicutils-8.8.0.1b24.dist-info/RECORD,,