dcicutils 8.8.0.1b22__py3-none-any.whl → 8.8.0.1b24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcicutils/portal_object_utils.py +113 -92
- dcicutils/portal_utils.py +45 -0
- dcicutils/structured_data.py +69 -49
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/METADATA +1 -1
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/RECORD +8 -8
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/entry_points.txt +0 -0
dcicutils/portal_object_utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from copy import deepcopy
|
2
2
|
from functools import lru_cache
|
3
3
|
import re
|
4
|
-
from typing import Any, List, Optional, Tuple, Type, Union
|
4
|
+
from typing import Any, Callable, List, Optional, Tuple, Type, Union
|
5
5
|
from dcicutils.data_readers import RowReader
|
6
6
|
from dcicutils.misc_utils import create_readonly_object
|
7
7
|
from dcicutils.portal_utils import Portal
|
@@ -19,7 +19,7 @@ class PortalObject:
|
|
19
19
|
self._data = data if isinstance(data, dict) else {}
|
20
20
|
self._portal = portal if isinstance(portal, Portal) else None
|
21
21
|
self._schema = schema if isinstance(schema, dict) else (schema.data if isinstance(schema, Schema) else None)
|
22
|
-
self._type = type if isinstance(type, str)
|
22
|
+
self._type = type if isinstance(type, str) else ""
|
23
23
|
|
24
24
|
@property
|
25
25
|
def data(self) -> dict:
|
@@ -31,8 +31,8 @@ class PortalObject:
|
|
31
31
|
|
32
32
|
@property
|
33
33
|
@lru_cache(maxsize=1)
|
34
|
-
def type(self) ->
|
35
|
-
return self._type or Portal.get_schema_type(self._data) or (Schema(self._schema).type if self._schema else
|
34
|
+
def type(self) -> str:
|
35
|
+
return self._type or Portal.get_schema_type(self._data) or (Schema(self._schema).type if self._schema else "")
|
36
36
|
|
37
37
|
@property
|
38
38
|
@lru_cache(maxsize=1)
|
@@ -75,86 +75,42 @@ class PortalObject:
|
|
75
75
|
identifying_properties.append("aliases")
|
76
76
|
return identifying_properties or None
|
77
77
|
|
78
|
-
@
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
"""
|
84
|
-
identifying_paths = []
|
85
|
-
if not (identifying_properties := self.identifying_properties):
|
86
|
-
if self.uuid:
|
87
|
-
if self.type:
|
88
|
-
identifying_paths.append(f"/{self.type}/{self.uuid}")
|
89
|
-
identifying_paths.append(f"/{self.uuid}")
|
90
|
-
return identifying_paths
|
91
|
-
for identifying_property in identifying_properties:
|
92
|
-
if (identifying_value := self._data.get(identifying_property)):
|
93
|
-
if identifying_property == "uuid":
|
94
|
-
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
95
|
-
identifying_paths.append(f"/{identifying_value}")
|
96
|
-
# For now at least we include the path both with and without the schema type component,
|
97
|
-
# as for some identifying values, it works (only) with, and some, it works (only) without.
|
98
|
-
# For example: If we have FileSet with "accession", an identifying property, with value
|
99
|
-
# SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
|
100
|
-
# conversely using "submitted_id", also an identifying property, with value
|
101
|
-
# UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
|
102
|
-
# not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
|
103
|
-
elif isinstance(identifying_value, list):
|
104
|
-
for identifying_value_item in identifying_value:
|
105
|
-
if self.type:
|
106
|
-
identifying_paths.append(f"/{self.type}/{identifying_value_item}")
|
107
|
-
identifying_paths.append(f"/{identifying_value_item}")
|
108
|
-
else:
|
109
|
-
if (schema := self.schema):
|
110
|
-
if pattern := schema.get("properties", {}).get(identifying_property, {}).get("pattern"):
|
111
|
-
if not re.match(pattern, identifying_value):
|
112
|
-
# If this identifying value is for a (identifying) property which has a
|
113
|
-
# pattern, and the value does NOT match the pattern, then do NOT include
|
114
|
-
# this value as an identifying path, since it cannot possibly be found.
|
115
|
-
continue
|
116
|
-
if self.type:
|
117
|
-
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
118
|
-
identifying_paths.append(f"/{identifying_value}")
|
119
|
-
return identifying_paths or None
|
120
|
-
|
121
|
-
@property
|
122
|
-
@lru_cache(maxsize=1)
|
123
|
-
def identifying_path(self) -> Optional[str]:
|
124
|
-
if identifying_paths := self.identifying_paths:
|
125
|
-
return identifying_paths[0]
|
126
|
-
|
127
|
-
def lookup(self, include_identifying_path: bool = False,
|
128
|
-
raw: bool = False) -> Optional[Union[Tuple[PortalObject, str], PortalObject]]:
|
129
|
-
return self._lookup(raw=raw) if include_identifying_path else self._lookup(raw=raw)[0]
|
130
|
-
|
131
|
-
def lookup_identifying_path(self) -> Optional[str]:
|
132
|
-
return self._lookup()[1]
|
133
|
-
|
134
|
-
def _lookup(self, raw: bool = False) -> Tuple[Optional[PortalObject], Optional[str]]:
|
78
|
+
@lru_cache(maxsize=8192)
|
79
|
+
def lookup(self, raw: bool = False,
|
80
|
+
ref_lookup_strategy: Optional[Callable] = None) -> Tuple[Optional[PortalObject], Optional[str], int]:
|
81
|
+
nlookups = 0
|
82
|
+
first_identifying_path = None
|
135
83
|
try:
|
136
|
-
if identifying_paths := self.
|
84
|
+
if identifying_paths := self._get_identifying_paths(ref_lookup_strategy=ref_lookup_strategy):
|
137
85
|
for identifying_path in identifying_paths:
|
86
|
+
if not first_identifying_path:
|
87
|
+
first_identifying_path = identifying_path
|
88
|
+
nlookups += 1
|
138
89
|
if (value := self._portal.get(identifying_path, raw=raw)) and (value.status_code == 200):
|
139
|
-
return
|
140
|
-
|
90
|
+
return (
|
91
|
+
PortalObject(value.json(), portal=self._portal, type=self.type if raw else None),
|
92
|
+
identifying_path,
|
93
|
+
nlookups
|
94
|
+
)
|
141
95
|
except Exception:
|
142
96
|
pass
|
143
|
-
return None,
|
97
|
+
return None, first_identifying_path, nlookups
|
144
98
|
|
145
99
|
def compare(self, value: Union[dict, PortalObject],
|
146
|
-
consider_refs: bool = False, resolved_refs: List[dict] = None) -> dict:
|
100
|
+
consider_refs: bool = False, resolved_refs: List[dict] = None) -> Tuple[dict, int]:
|
147
101
|
if consider_refs and isinstance(resolved_refs, list):
|
148
|
-
|
102
|
+
normlized_portal_object, nlookups = self._normalized_refs(refs=resolved_refs)
|
103
|
+
this_data = normlized_portal_object.data
|
149
104
|
else:
|
150
105
|
this_data = self.data
|
106
|
+
nlookups = 0
|
151
107
|
if isinstance(value, PortalObject):
|
152
108
|
comparing_data = value.data
|
153
109
|
elif isinstance(value, dict):
|
154
110
|
comparing_data = value
|
155
111
|
else:
|
156
|
-
return {}
|
157
|
-
return PortalObject._compare(this_data, comparing_data)
|
112
|
+
return {}, nlookups
|
113
|
+
return PortalObject._compare(this_data, comparing_data), nlookups
|
158
114
|
|
159
115
|
@staticmethod
|
160
116
|
def _compare(a: Any, b: Any, _path: Optional[str] = None) -> dict:
|
@@ -201,42 +157,106 @@ class PortalObject:
|
|
201
157
|
diffs[_path] = diff_updating(a, b)
|
202
158
|
return diffs
|
203
159
|
|
204
|
-
|
160
|
+
@lru_cache(maxsize=1)
|
161
|
+
def _get_identifying_paths(self, ref_lookup_strategy: Optional[Callable] = None) -> Optional[List[str]]:
|
205
162
|
"""
|
206
|
-
|
207
|
-
this Portal object into the uuid style reference (e.g. d1b67068-300f-483f-bfe8-63d23c93801f),
|
208
|
-
based on the given "refs" list which is assumed to be a list of dictionaries, where each
|
209
|
-
contains a "path" and a "uuid" property; this list is typically (for our first usage of
|
210
|
-
this function) the value of structured_data.StructuredDataSet.resolved_refs_with_uuid.
|
211
|
-
Changes are made to this Portal object in place; use normalized_refs function to make a copy.
|
212
|
-
If there are no "refs" (None or empty) or if the specified reference is not found in this
|
213
|
-
list then the references will be looked up via Portal calls (via Portal.get_metadata).
|
163
|
+
Returns a list of the possible Portal URL paths identifying this Portal object.
|
214
164
|
"""
|
215
|
-
|
165
|
+
identifying_paths = []
|
166
|
+
if not (identifying_properties := self.identifying_properties):
|
167
|
+
if self.uuid:
|
168
|
+
if self.type:
|
169
|
+
identifying_paths.append(f"/{self.type}/{self.uuid}")
|
170
|
+
identifying_paths.append(f"/{self.uuid}")
|
171
|
+
return identifying_paths
|
172
|
+
for identifying_property in identifying_properties:
|
173
|
+
if identifying_value := self._data.get(identifying_property):
|
174
|
+
if identifying_property == "uuid":
|
175
|
+
if self.type:
|
176
|
+
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
177
|
+
identifying_paths.append(f"/{identifying_value}")
|
178
|
+
# For now at least we include the path both with and without the schema type component,
|
179
|
+
# as for some identifying values, it works (only) with, and some, it works (only) without.
|
180
|
+
# For example: If we have FileSet with "accession", an identifying property, with value
|
181
|
+
# SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
|
182
|
+
# conversely using "submitted_id", also an identifying property, with value
|
183
|
+
# UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
|
184
|
+
# not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
|
185
|
+
elif isinstance(identifying_value, list):
|
186
|
+
for identifying_value_item in identifying_value:
|
187
|
+
if self.type:
|
188
|
+
identifying_paths.append(f"/{self.type}/{identifying_value_item}")
|
189
|
+
identifying_paths.append(f"/{identifying_value_item}")
|
190
|
+
else:
|
191
|
+
# TODO: Import from somewhere ...
|
192
|
+
lookup_options = 0
|
193
|
+
if schema := self.schema:
|
194
|
+
# TODO: Hook into the ref_lookup_strategy thing in structured_data to make
|
195
|
+
# sure we check accession format (since it does not have a pattern).
|
196
|
+
if callable(ref_lookup_strategy):
|
197
|
+
lookup_options, ref_validator = ref_lookup_strategy(
|
198
|
+
self.type, schema, identifying_value)
|
199
|
+
if callable(ref_validator):
|
200
|
+
if ref_validator(schema, identifying_property, identifying_value) is False:
|
201
|
+
continue
|
202
|
+
if pattern := schema.get("properties", {}).get(identifying_property, {}).get("pattern"):
|
203
|
+
if not re.match(pattern, identifying_value):
|
204
|
+
# If this identifying value is for a (identifying) property which has a
|
205
|
+
# pattern, and the value does NOT match the pattern, then do NOT include
|
206
|
+
# this value as an identifying path, since it cannot possibly be found.
|
207
|
+
continue
|
208
|
+
if not lookup_options:
|
209
|
+
lookup_options = Portal.LOOKUP_DEFAULT
|
210
|
+
if Portal.is_lookup_root_first(lookup_options):
|
211
|
+
identifying_paths.append(f"/{identifying_value}")
|
212
|
+
if Portal.is_lookup_specified_type(lookup_options) and self.type:
|
213
|
+
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
214
|
+
if Portal.is_lookup_root(lookup_options) and not Portal.is_lookup_root_first(lookup_options):
|
215
|
+
identifying_paths.append(f"/{identifying_value}")
|
216
|
+
if Portal.is_lookup_subtypes(lookup_options):
|
217
|
+
for subtype_name in self._portal.get_schema_subtype_names(self.type):
|
218
|
+
identifying_paths.append(f"/{subtype_name}/{identifying_value}")
|
219
|
+
return identifying_paths or None
|
216
220
|
|
217
|
-
def
|
221
|
+
def _normalized_refs(self, refs: List[dict]) -> Tuple[PortalObject, int]:
|
218
222
|
"""
|
219
|
-
Same as
|
223
|
+
Same as _normalize_refs but does NOT make this change to this Portal object IN PLACE,
|
220
224
|
rather it returns a new instance of this Portal object wrapped in a new PortalObject.
|
221
225
|
"""
|
222
226
|
portal_object = self.copy()
|
223
|
-
portal_object.
|
224
|
-
return portal_object
|
227
|
+
nlookups = portal_object._normalize_refs(refs)
|
228
|
+
return portal_object, nlookups
|
229
|
+
|
230
|
+
def _normalize_refs(self, refs: List[dict]) -> int:
|
231
|
+
"""
|
232
|
+
Turns any (linkTo) references which are paths (e.g. /SubmissionCenter/uwsc_gcc) within this
|
233
|
+
object IN PLACE into the uuid style reference (e.g. d1b67068-300f-483f-bfe8-63d23c93801f),
|
234
|
+
based on the given "refs" list which is assumed to be a list of dictionaries, where each
|
235
|
+
contains a "path" and a "uuid" property; this list is typically (for our first usage of
|
236
|
+
this function) the value of structured_data.StructuredDataSet.resolved_refs_with_uuid.
|
237
|
+
Changes are made to this Portal object IN PLACE; use _normalized_refs function to make a copy.
|
238
|
+
If there are no "refs" (None or empty) or if the specified reference is not found in this
|
239
|
+
list then the references will be looked up via Portal calls (via Portal.get_metadata).
|
240
|
+
"""
|
241
|
+
_, nlookups = PortalObject._normalize_data_refs(self.data, refs=refs, schema=self.schema, portal=self.portal)
|
242
|
+
return nlookups
|
225
243
|
|
226
244
|
@staticmethod
|
227
|
-
def
|
245
|
+
def _normalize_data_refs(value: Any, refs: List[dict], schema: dict,
|
246
|
+
portal: Portal, _path: Optional[str] = None) -> Tuple[Any, int]:
|
247
|
+
nlookups = 0
|
228
248
|
if not value or not isinstance(schema, dict):
|
229
|
-
return value
|
249
|
+
return value, nlookups
|
230
250
|
if isinstance(value, dict):
|
231
251
|
for key in value:
|
232
252
|
path = f"{_path}.{key}" if _path else key
|
233
|
-
value[key] = PortalObject.
|
234
|
-
|
253
|
+
value[key], nlookups = PortalObject._normalize_data_refs(value[key], refs=refs,
|
254
|
+
schema=schema, portal=portal, _path=path)
|
235
255
|
elif isinstance(value, list):
|
236
256
|
for index in range(len(value)):
|
237
257
|
path = f"{_path or ''}#{index}"
|
238
|
-
value[index] = PortalObject.
|
239
|
-
|
258
|
+
value[index], nlookups = PortalObject._normalize_data_refs(value[index], refs=refs,
|
259
|
+
schema=schema, portal=portal, _path=path)
|
240
260
|
elif value_type := Schema.get_property_by_path(schema, _path):
|
241
261
|
if link_to := value_type.get("linkTo"):
|
242
262
|
ref_path = f"/{link_to}/{value}"
|
@@ -247,7 +267,7 @@ class PortalObject:
|
|
247
267
|
else:
|
248
268
|
ref_uuid = None
|
249
269
|
if ref_uuid:
|
250
|
-
return ref_uuid
|
270
|
+
return ref_uuid, nlookups
|
251
271
|
# Here our (linkTo) reference appears not to be in the given refs; if these refs came
|
252
272
|
# from structured_data.StructuredDataSet.resolved_refs_with_uuid (in the context of
|
253
273
|
# smaht-submitr, which is the typical/first use case for this function) then this could
|
@@ -255,6 +275,7 @@ class PortalObject:
|
|
255
275
|
# the data/spreadsheet being submitted. In any case, we don't have the associated uuid
|
256
276
|
# so let us look it up here.
|
257
277
|
if isinstance(portal, Portal):
|
278
|
+
nlookups += 1
|
258
279
|
if (ref_object := portal.get_metadata(ref_path)) and (ref_uuid := ref_object.get("uuid")):
|
259
|
-
return ref_uuid
|
260
|
-
return value
|
280
|
+
return ref_uuid, nlookups
|
281
|
+
return value, nlookups
|
dcicutils/portal_utils.py
CHANGED
@@ -46,6 +46,22 @@ class Portal:
|
|
46
46
|
KEYS_FILE_DIRECTORY = "~"
|
47
47
|
MIME_TYPE_JSON = "application/json"
|
48
48
|
|
49
|
+
# Object lookup strategies; on a per-reference (type/value) basis, used currently ONLY by
|
50
|
+
# structured_data.py; controlled by an optional ref_lookup_strategy callable; default is
|
51
|
+
# lookup at root path but after the specified type path lookup, and then lookup all subtypes;
|
52
|
+
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
53
|
+
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
54
|
+
# and value (string) arguments and return an integer of any of the below ORed together.
|
55
|
+
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
56
|
+
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
57
|
+
# currently (smaht-portal/.../ingestion_processors) use LOOKUP_ROOT_FIRST for this.
|
58
|
+
# And current usage NEVER has LOOKUP_SUBTYPES turned OFF; but support just in case.
|
59
|
+
LOOKUP_SPECIFIED_TYPE = 0x0001
|
60
|
+
LOOKUP_ROOT = 0x0002
|
61
|
+
LOOKUP_ROOT_FIRST = 0x0004 | LOOKUP_ROOT
|
62
|
+
LOOKUP_SUBTYPES = 0x0008
|
63
|
+
LOOKUP_DEFAULT = LOOKUP_SPECIFIED_TYPE | LOOKUP_ROOT | LOOKUP_SUBTYPES
|
64
|
+
|
49
65
|
def __init__(self,
|
50
66
|
arg: Optional[Union[Portal, TestApp, VirtualApp, PyramidRouter, dict, tuple, str]] = None,
|
51
67
|
env: Optional[str] = None, server: Optional[str] = None,
|
@@ -188,9 +204,27 @@ class Portal:
|
|
188
204
|
def vapp(self) -> Optional[TestApp]:
|
189
205
|
return self._vapp
|
190
206
|
|
207
|
+
@staticmethod
|
208
|
+
def is_lookup_specified_type(lookup_options: int) -> bool:
|
209
|
+
return (lookup_options &
|
210
|
+
Portal.LOOKUP_SPECIFIED_TYPE) == Portal.LOOKUP_SPECIFIED_TYPE
|
211
|
+
|
212
|
+
@staticmethod
|
213
|
+
def is_lookup_root(lookup_options: int) -> bool:
|
214
|
+
return (lookup_options & Portal.LOOKUP_ROOT) == Portal.LOOKUP_ROOT
|
215
|
+
|
216
|
+
@staticmethod
|
217
|
+
def is_lookup_root_first(lookup_options: int) -> bool:
|
218
|
+
return (lookup_options & Portal.LOOKUP_ROOT_FIRST) == Portal.LOOKUP_ROOT_FIRST
|
219
|
+
|
220
|
+
@staticmethod
|
221
|
+
def is_lookup_subtypes(lookup_options: int) -> bool:
|
222
|
+
return (lookup_options & Portal.LOOKUP_SUBTYPES) == Portal.LOOKUP_SUBTYPES
|
223
|
+
|
191
224
|
def get(self, url: str, follow: bool = True,
|
192
225
|
raw: bool = False, database: bool = False, raise_for_status: bool = False, **kwargs) -> OptionalResponse:
|
193
226
|
url = self.url(url, raw, database)
|
227
|
+
# print(f'xyzzy.portal.get({url})')
|
194
228
|
if not self.vapp:
|
195
229
|
response = requests.get(url, allow_redirects=follow, **self._kwargs(**kwargs))
|
196
230
|
else:
|
@@ -205,6 +239,7 @@ class Portal:
|
|
205
239
|
def patch(self, url: str, data: Optional[dict] = None, json: Optional[dict] = None,
|
206
240
|
raise_for_status: bool = False, **kwargs) -> OptionalResponse:
|
207
241
|
url = self.url(url)
|
242
|
+
# print(f'xyzzy.portal.patch({url})')
|
208
243
|
if not self.vapp:
|
209
244
|
response = requests.patch(url, data=data, json=json, **self._kwargs(**kwargs))
|
210
245
|
else:
|
@@ -217,6 +252,7 @@ class Portal:
|
|
217
252
|
def post(self, url: str, data: Optional[dict] = None, json: Optional[dict] = None, files: Optional[dict] = None,
|
218
253
|
raise_for_status: bool = False, **kwargs) -> OptionalResponse:
|
219
254
|
url = self.url(url)
|
255
|
+
# print(f'xyzzy.portal.post({url})')
|
220
256
|
if files and not ("headers" in kwargs):
|
221
257
|
# Setting headers to None when using files implies content-type multipart/form-data.
|
222
258
|
kwargs["headers"] = None
|
@@ -233,6 +269,7 @@ class Portal:
|
|
233
269
|
return response
|
234
270
|
|
235
271
|
def get_metadata(self, object_id: str, raw: bool = False, database: bool = False) -> Optional[dict]:
|
272
|
+
# print(f'xyzzy.portal.get_metadata({object_id})')
|
236
273
|
if isinstance(raw, bool) and raw:
|
237
274
|
add_on = "frame=raw" + ("&datastore=database" if isinstance(database, bool) and database else "")
|
238
275
|
elif database:
|
@@ -242,11 +279,13 @@ class Portal:
|
|
242
279
|
return get_metadata(obj_id=object_id, vapp=self.vapp, key=self.key, add_on=add_on)
|
243
280
|
|
244
281
|
def patch_metadata(self, object_id: str, data: dict) -> Optional[dict]:
|
282
|
+
# print(f'xyzzy.portal.patch_metadata({object_id})')
|
245
283
|
if self.key:
|
246
284
|
return patch_metadata(obj_id=object_id, patch_item=data, key=self.key)
|
247
285
|
return self.patch(f"/{object_id}", data).json()
|
248
286
|
|
249
287
|
def post_metadata(self, object_type: str, data: dict) -> Optional[dict]:
|
288
|
+
# print(f'xyzzy.portal.post_metadata({object_type})')
|
250
289
|
if self.key:
|
251
290
|
return post_metadata(schema_name=object_type, post_item=data, key=self.key)
|
252
291
|
return self.post(f"/{object_type}", data).json()
|
@@ -358,6 +397,12 @@ class Portal:
|
|
358
397
|
super_type_map_flattened[super_type_name] = list_breadth_first(super_type_map, super_type_name)
|
359
398
|
return super_type_map_flattened
|
360
399
|
|
400
|
+
@lru_cache(maxsize=64)
|
401
|
+
def get_schema_subtype_names(self, type_name: str) -> List[str]:
|
402
|
+
if not (schemas_super_type_map := self.get_schemas_super_type_map()):
|
403
|
+
return []
|
404
|
+
return schemas_super_type_map.get(type_name, [])
|
405
|
+
|
361
406
|
def url(self, url: str, raw: bool = False, database: bool = False) -> str:
|
362
407
|
if not isinstance(url, str) or not url:
|
363
408
|
return "/"
|
dcicutils/structured_data.py
CHANGED
@@ -48,22 +48,6 @@ StructuredDataSet = Type["StructuredDataSet"]
|
|
48
48
|
|
49
49
|
class StructuredDataSet:
|
50
50
|
|
51
|
-
# Reference (linkTo) lookup strategies; on a per-reference (type/value) basis;
|
52
|
-
# controlled by optional ref_lookup_strategy callable; default is lookup at root path
|
53
|
-
# but after the named reference (linkTo) type path lookup, and then lookup all subtypes;
|
54
|
-
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
55
|
-
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
56
|
-
# and value (string) arguments and return an integer of any of the below ORed together.
|
57
|
-
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
58
|
-
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
59
|
-
# currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
|
60
|
-
# And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
|
61
|
-
REF_LOOKUP_SPECIFIED_TYPE = 0x0001
|
62
|
-
REF_LOOKUP_ROOT = 0x0002
|
63
|
-
REF_LOOKUP_ROOT_FIRST = 0x0004 | REF_LOOKUP_ROOT
|
64
|
-
REF_LOOKUP_SUBTYPES = 0x0008
|
65
|
-
REF_LOOKUP_DEFAULT = REF_LOOKUP_SPECIFIED_TYPE | REF_LOOKUP_ROOT | REF_LOOKUP_SUBTYPES
|
66
|
-
|
67
51
|
def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
|
68
52
|
schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
|
69
53
|
order: Optional[List[str]] = None, prune: bool = True,
|
@@ -76,6 +60,7 @@ class StructuredDataSet:
|
|
76
60
|
self._portal = Portal(portal, data=self._data, schemas=schemas,
|
77
61
|
ref_lookup_strategy=ref_lookup_strategy,
|
78
62
|
ref_lookup_nocache=ref_lookup_nocache) if portal else None
|
63
|
+
self._ref_lookup_strategy = ref_lookup_strategy
|
79
64
|
self._order = order
|
80
65
|
self._prune = prune
|
81
66
|
self._warnings = {}
|
@@ -199,25 +184,49 @@ class StructuredDataSet:
|
|
199
184
|
upload_file["path"] = file_path
|
200
185
|
return upload_files
|
201
186
|
|
202
|
-
def compare(self) -> dict:
|
187
|
+
def compare(self, progress: Optional[Callable] = None) -> dict:
|
188
|
+
def get_counts() -> int:
|
189
|
+
ntypes = 0
|
190
|
+
nobjects = 0
|
191
|
+
if self.data:
|
192
|
+
ntypes = len(self.data)
|
193
|
+
for type_name in self.data:
|
194
|
+
nobjects += len(self.data[type_name])
|
195
|
+
return ntypes, nobjects
|
203
196
|
diffs = {}
|
204
|
-
if
|
197
|
+
if callable(progress):
|
198
|
+
ntypes, nobjects = get_counts()
|
199
|
+
progress({"start": True, "types": ntypes, "objects": nobjects})
|
200
|
+
if self.data or self.portal: # TODO: what is this OR biz?
|
205
201
|
refs = self.resolved_refs_with_uuids
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
202
|
+
# TODO: Need feedback/progress tracking mechanism here.
|
203
|
+
# TODO: Check validity of reference; actually check that earlier on even maybe.
|
204
|
+
for type_name in self.data:
|
205
|
+
if not diffs.get(type_name):
|
206
|
+
diffs[type_name] = []
|
207
|
+
for portal_object in self.data[type_name]:
|
208
|
+
portal_object = PortalObject(portal_object, portal=self.portal, type=type_name)
|
209
|
+
existing_object, identifying_path, nlookups = (
|
210
|
+
portal_object.lookup(raw=True, ref_lookup_strategy=self._ref_lookup_strategy))
|
212
211
|
if existing_object:
|
213
|
-
object_diffs = portal_object.compare(
|
214
|
-
|
215
|
-
|
216
|
-
|
212
|
+
object_diffs, nlookups_compare = portal_object.compare(
|
213
|
+
existing_object, consider_refs=True, resolved_refs=refs)
|
214
|
+
diffs[type_name].append(create_readonly_object(path=identifying_path,
|
215
|
+
uuid=existing_object.uuid,
|
216
|
+
diffs=object_diffs or None))
|
217
|
+
if callable(progress):
|
218
|
+
progress({"update": True, "lookups": nlookups + nlookups_compare})
|
217
219
|
elif identifying_path:
|
218
220
|
# If there is no existing object we still create a record for this object
|
219
221
|
# but with no uuid which will be the indication that it does not exist.
|
220
|
-
diffs[
|
222
|
+
diffs[type_name].append(create_readonly_object(path=identifying_path, uuid=None, diffs=None))
|
223
|
+
if callable(progress):
|
224
|
+
progress({"create": True, "lookups": nlookups})
|
225
|
+
else:
|
226
|
+
if callable(progress):
|
227
|
+
progress({"lookups": nlookups})
|
228
|
+
if callable(progress):
|
229
|
+
progress({"finish": True})
|
221
230
|
return diffs
|
222
231
|
|
223
232
|
def _load_file(self, file: str) -> None:
|
@@ -251,7 +260,7 @@ class StructuredDataSet:
|
|
251
260
|
self._load_reader(CsvReader(file), type_name=Schema.type_name(file))
|
252
261
|
|
253
262
|
def _load_excel_file(self, file: str) -> None:
|
254
|
-
def
|
263
|
+
def get_counts() -> Tuple[int, int]:
|
255
264
|
nonlocal file
|
256
265
|
excel = Excel(file)
|
257
266
|
nrows = 0
|
@@ -260,18 +269,27 @@ class StructuredDataSet:
|
|
260
269
|
nrows += 1
|
261
270
|
return nrows, len(excel.sheet_names)
|
262
271
|
if self._progress:
|
263
|
-
|
272
|
+
nrows, nsheets = get_counts()
|
273
|
+
self._progress({"start": True, "sheets": nsheets, "rows": nrows})
|
274
|
+
"""
|
275
|
+
if self._progress:
|
276
|
+
self._progress_update(get_counts)
|
277
|
+
"""
|
264
278
|
excel = Excel(file) # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl).
|
265
279
|
order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {}
|
266
280
|
for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)):
|
267
281
|
self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name))
|
282
|
+
if self._progress:
|
283
|
+
self._progress({"finish": True})
|
284
|
+
# TODO: Do we really need progress reporting for the below?
|
268
285
|
# Check for unresolved reference errors which really are not because of ordering.
|
269
286
|
# Yes such internal references will be handled correctly on actual database update via snovault.loadxl.
|
270
287
|
if ref_errors := self.ref_errors:
|
271
288
|
ref_errors_actual = []
|
272
289
|
for ref_error in ref_errors:
|
273
290
|
if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
|
274
|
-
# if
|
291
|
+
# TODO: Probably do this instead; and if so then no progress needed (per question above).
|
292
|
+
# if not (resolved := self.portal.ref_exists_internally(ref := ref_error["error"])):
|
275
293
|
ref_errors_actual.append(ref_error)
|
276
294
|
else:
|
277
295
|
self._resolved_refs.add((ref, resolved.get("uuid")))
|
@@ -304,13 +322,15 @@ class StructuredDataSet:
|
|
304
322
|
self._add_properties(structured_row, self._autoadd_properties, schema)
|
305
323
|
self._add(type_name, structured_row)
|
306
324
|
if self._progress:
|
307
|
-
self.
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
325
|
+
self._progress({
|
326
|
+
"parse": True,
|
327
|
+
"refs": self.ref_total_count,
|
328
|
+
"refs_found": self.ref_total_found_count,
|
329
|
+
"refs_not_found": self.ref_total_notfound_count,
|
330
|
+
"refs_lookup": self.ref_lookup_count,
|
331
|
+
"refs_cache_hit": self.ref_lookup_cache_hit_count,
|
332
|
+
"refs_invalid": self.ref_invalid_identifying_property_count
|
333
|
+
})
|
314
334
|
self._note_warning(reader.warnings, "reader")
|
315
335
|
if schema:
|
316
336
|
self._note_error(schema._unresolved_refs, "ref")
|
@@ -331,16 +351,16 @@ class StructuredDataSet:
|
|
331
351
|
|
332
352
|
def _is_ref_lookup_specified_type(ref_lookup_flags: int) -> bool:
|
333
353
|
return (ref_lookup_flags &
|
334
|
-
|
354
|
+
Portal.LOOKUP_SPECIFIED_TYPE) == Portal.LOOKUP_SPECIFIED_TYPE
|
335
355
|
|
336
356
|
def _is_ref_lookup_root(ref_lookup_flags: int) -> bool:
|
337
|
-
return (ref_lookup_flags &
|
357
|
+
return (ref_lookup_flags & Portal.LOOKUP_ROOT) == Portal.LOOKUP_ROOT
|
338
358
|
|
339
359
|
def _is_ref_lookup_root_first(ref_lookup_flags: int) -> bool:
|
340
|
-
return (ref_lookup_flags &
|
360
|
+
return (ref_lookup_flags & Portal.LOOKUP_ROOT_FIRST) == Portal.LOOKUP_ROOT_FIRST
|
341
361
|
|
342
362
|
def _is_ref_lookup_subtypes(ref_lookup_flags: int) -> bool:
|
343
|
-
return (ref_lookup_flags &
|
363
|
+
return (ref_lookup_flags & Portal.LOOKUP_SUBTYPES) == Portal.LOOKUP_SUBTYPES
|
344
364
|
|
345
365
|
@property
|
346
366
|
def ref_total_count(self) -> int:
|
@@ -786,7 +806,7 @@ class Portal(PortalBase):
|
|
786
806
|
if callable(ref_lookup_strategy):
|
787
807
|
self._ref_lookup_strategy = ref_lookup_strategy
|
788
808
|
else:
|
789
|
-
self._ref_lookup_strategy = lambda type_name, schema, value: (
|
809
|
+
self._ref_lookup_strategy = lambda type_name, schema, value: (Portal.LOOKUP_DEFAULT, None)
|
790
810
|
if ref_lookup_nocache is True:
|
791
811
|
self.ref_lookup = self.ref_lookup_uncached
|
792
812
|
self._ref_cache = None
|
@@ -844,7 +864,7 @@ class Portal(PortalBase):
|
|
844
864
|
return schemas
|
845
865
|
|
846
866
|
@lru_cache(maxsize=64)
|
847
|
-
def
|
867
|
+
def _get_schema_subtype_names(self, type_name: str) -> List[str]:
|
848
868
|
if not (schemas_super_type_map := self.get_schemas_super_type_map()):
|
849
869
|
return []
|
850
870
|
return schemas_super_type_map.get(type_name, [])
|
@@ -907,7 +927,7 @@ class Portal(PortalBase):
|
|
907
927
|
lookup_paths.append(f"/{type_name}/{value}")
|
908
928
|
if is_ref_lookup_root and not is_ref_lookup_root_first:
|
909
929
|
lookup_paths.append(f"/{value}")
|
910
|
-
subtype_names = self.
|
930
|
+
subtype_names = self._get_schema_subtype_names(type_name) if is_ref_lookup_subtypes else []
|
911
931
|
for subtype_name in subtype_names:
|
912
932
|
lookup_paths.append(f"/{subtype_name}/{value}")
|
913
933
|
if not lookup_paths:
|
@@ -946,7 +966,7 @@ class Portal(PortalBase):
|
|
946
966
|
ref_lookup_strategy, ref_validator = (
|
947
967
|
self._ref_lookup_strategy(type_name, self.get_schema(type_name), value))
|
948
968
|
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
949
|
-
subtype_names = self.
|
969
|
+
subtype_names = self._get_schema_subtype_names(type_name) if is_ref_lookup_subtypes else []
|
950
970
|
for type_name in [type_name] + subtype_names:
|
951
971
|
is_resolved, resolved_item = self._ref_exists_single_internally(type_name, value)
|
952
972
|
if is_resolved:
|
@@ -1008,7 +1028,7 @@ class Portal(PortalBase):
|
|
1008
1028
|
if not is_uuid(property_value):
|
1009
1029
|
return False
|
1010
1030
|
return True
|
1011
|
-
for schema_name in [type_name] + self.
|
1031
|
+
for schema_name in [type_name] + self._get_schema_subtype_names(type_name):
|
1012
1032
|
if schema := self.get_schema(schema_name):
|
1013
1033
|
if identifying_properties := schema.get("identifyingProperties"):
|
1014
1034
|
for identifying_property in identifying_properties:
|
@@ -1033,7 +1053,7 @@ class Portal(PortalBase):
|
|
1033
1053
|
|
1034
1054
|
def _cache_ref(self, type_name: str, value: str, resolved: List[str]) -> None:
|
1035
1055
|
if self._ref_cache is not None:
|
1036
|
-
subtype_names = self.
|
1056
|
+
subtype_names = self._get_schema_subtype_names(type_name)
|
1037
1057
|
for type_name in [type_name] + subtype_names:
|
1038
1058
|
self._ref_cache[f"/{type_name}/{value}"] = resolved
|
1039
1059
|
|
@@ -46,8 +46,8 @@ dcicutils/log_utils.py,sha256=7pWMc6vyrorUZQf-V-M3YC6zrPgNhuV_fzm9xqTPph0,10883
|
|
46
46
|
dcicutils/misc_utils.py,sha256=zVc4urdVGgnWjQ4UQlrGH-URAzr2l_PwZWI3u_GJdFE,102210
|
47
47
|
dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmjw,5963
|
48
48
|
dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
|
49
|
-
dcicutils/portal_object_utils.py,sha256=
|
50
|
-
dcicutils/portal_utils.py,sha256=
|
49
|
+
dcicutils/portal_object_utils.py,sha256=7gteQ5CM6IVDfHx-UPFiOfeE1fJYOir_uwWdRTykExQ,15374
|
50
|
+
dcicutils/portal_utils.py,sha256=trM8L9J1CPXntdsKSL56hy7SMpftNNpIReAn5iihGes,30050
|
51
51
|
dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
|
52
52
|
dcicutils/qa_checkers.py,sha256=cdXjeL0jCDFDLT8VR8Px78aS10hwNISOO5G_Zv2TZ6M,20534
|
53
53
|
dcicutils/qa_utils.py,sha256=TT0SiJWiuxYvbsIyhK9VO4uV_suxhB6CpuC4qPacCzQ,160208
|
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
|
|
62
62
|
dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
|
63
63
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
64
64
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
65
|
-
dcicutils/structured_data.py,sha256=
|
65
|
+
dcicutils/structured_data.py,sha256=aXyLqYlgp5DInGiWw6WZk08vfvKI0IW2eWW57n0zhz0,57549
|
66
66
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
67
67
|
dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
|
68
68
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
69
69
|
dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
|
70
70
|
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
71
71
|
dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
|
72
|
-
dcicutils-8.8.0.
|
73
|
-
dcicutils-8.8.0.
|
74
|
-
dcicutils-8.8.0.
|
75
|
-
dcicutils-8.8.0.
|
76
|
-
dcicutils-8.8.0.
|
72
|
+
dcicutils-8.8.0.1b24.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
|
73
|
+
dcicutils-8.8.0.1b24.dist-info/METADATA,sha256=z5fS6VG0wLUCd3NTEvsMF5rdWwLg7TmWlTs1Syg-7Sk,3357
|
74
|
+
dcicutils-8.8.0.1b24.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
75
|
+
dcicutils-8.8.0.1b24.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
|
76
|
+
dcicutils-8.8.0.1b24.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|