dcicutils 8.8.0.1b22__py3-none-any.whl → 8.8.0.1b24__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- dcicutils/portal_object_utils.py +113 -92
- dcicutils/portal_utils.py +45 -0
- dcicutils/structured_data.py +69 -49
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/METADATA +1 -1
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/RECORD +8 -8
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.0.1b22.dist-info → dcicutils-8.8.0.1b24.dist-info}/entry_points.txt +0 -0
dcicutils/portal_object_utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from copy import deepcopy
|
2
2
|
from functools import lru_cache
|
3
3
|
import re
|
4
|
-
from typing import Any, List, Optional, Tuple, Type, Union
|
4
|
+
from typing import Any, Callable, List, Optional, Tuple, Type, Union
|
5
5
|
from dcicutils.data_readers import RowReader
|
6
6
|
from dcicutils.misc_utils import create_readonly_object
|
7
7
|
from dcicutils.portal_utils import Portal
|
@@ -19,7 +19,7 @@ class PortalObject:
|
|
19
19
|
self._data = data if isinstance(data, dict) else {}
|
20
20
|
self._portal = portal if isinstance(portal, Portal) else None
|
21
21
|
self._schema = schema if isinstance(schema, dict) else (schema.data if isinstance(schema, Schema) else None)
|
22
|
-
self._type = type if isinstance(type, str)
|
22
|
+
self._type = type if isinstance(type, str) else ""
|
23
23
|
|
24
24
|
@property
|
25
25
|
def data(self) -> dict:
|
@@ -31,8 +31,8 @@ class PortalObject:
|
|
31
31
|
|
32
32
|
@property
|
33
33
|
@lru_cache(maxsize=1)
|
34
|
-
def type(self) ->
|
35
|
-
return self._type or Portal.get_schema_type(self._data) or (Schema(self._schema).type if self._schema else
|
34
|
+
def type(self) -> str:
|
35
|
+
return self._type or Portal.get_schema_type(self._data) or (Schema(self._schema).type if self._schema else "")
|
36
36
|
|
37
37
|
@property
|
38
38
|
@lru_cache(maxsize=1)
|
@@ -75,86 +75,42 @@ class PortalObject:
|
|
75
75
|
identifying_properties.append("aliases")
|
76
76
|
return identifying_properties or None
|
77
77
|
|
78
|
-
@
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
"""
|
84
|
-
identifying_paths = []
|
85
|
-
if not (identifying_properties := self.identifying_properties):
|
86
|
-
if self.uuid:
|
87
|
-
if self.type:
|
88
|
-
identifying_paths.append(f"/{self.type}/{self.uuid}")
|
89
|
-
identifying_paths.append(f"/{self.uuid}")
|
90
|
-
return identifying_paths
|
91
|
-
for identifying_property in identifying_properties:
|
92
|
-
if (identifying_value := self._data.get(identifying_property)):
|
93
|
-
if identifying_property == "uuid":
|
94
|
-
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
95
|
-
identifying_paths.append(f"/{identifying_value}")
|
96
|
-
# For now at least we include the path both with and without the schema type component,
|
97
|
-
# as for some identifying values, it works (only) with, and some, it works (only) without.
|
98
|
-
# For example: If we have FileSet with "accession", an identifying property, with value
|
99
|
-
# SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
|
100
|
-
# conversely using "submitted_id", also an identifying property, with value
|
101
|
-
# UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
|
102
|
-
# not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
|
103
|
-
elif isinstance(identifying_value, list):
|
104
|
-
for identifying_value_item in identifying_value:
|
105
|
-
if self.type:
|
106
|
-
identifying_paths.append(f"/{self.type}/{identifying_value_item}")
|
107
|
-
identifying_paths.append(f"/{identifying_value_item}")
|
108
|
-
else:
|
109
|
-
if (schema := self.schema):
|
110
|
-
if pattern := schema.get("properties", {}).get(identifying_property, {}).get("pattern"):
|
111
|
-
if not re.match(pattern, identifying_value):
|
112
|
-
# If this identifying value is for a (identifying) property which has a
|
113
|
-
# pattern, and the value does NOT match the pattern, then do NOT include
|
114
|
-
# this value as an identifying path, since it cannot possibly be found.
|
115
|
-
continue
|
116
|
-
if self.type:
|
117
|
-
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
118
|
-
identifying_paths.append(f"/{identifying_value}")
|
119
|
-
return identifying_paths or None
|
120
|
-
|
121
|
-
@property
|
122
|
-
@lru_cache(maxsize=1)
|
123
|
-
def identifying_path(self) -> Optional[str]:
|
124
|
-
if identifying_paths := self.identifying_paths:
|
125
|
-
return identifying_paths[0]
|
126
|
-
|
127
|
-
def lookup(self, include_identifying_path: bool = False,
|
128
|
-
raw: bool = False) -> Optional[Union[Tuple[PortalObject, str], PortalObject]]:
|
129
|
-
return self._lookup(raw=raw) if include_identifying_path else self._lookup(raw=raw)[0]
|
130
|
-
|
131
|
-
def lookup_identifying_path(self) -> Optional[str]:
|
132
|
-
return self._lookup()[1]
|
133
|
-
|
134
|
-
def _lookup(self, raw: bool = False) -> Tuple[Optional[PortalObject], Optional[str]]:
|
78
|
+
@lru_cache(maxsize=8192)
|
79
|
+
def lookup(self, raw: bool = False,
|
80
|
+
ref_lookup_strategy: Optional[Callable] = None) -> Tuple[Optional[PortalObject], Optional[str], int]:
|
81
|
+
nlookups = 0
|
82
|
+
first_identifying_path = None
|
135
83
|
try:
|
136
|
-
if identifying_paths := self.
|
84
|
+
if identifying_paths := self._get_identifying_paths(ref_lookup_strategy=ref_lookup_strategy):
|
137
85
|
for identifying_path in identifying_paths:
|
86
|
+
if not first_identifying_path:
|
87
|
+
first_identifying_path = identifying_path
|
88
|
+
nlookups += 1
|
138
89
|
if (value := self._portal.get(identifying_path, raw=raw)) and (value.status_code == 200):
|
139
|
-
return
|
140
|
-
|
90
|
+
return (
|
91
|
+
PortalObject(value.json(), portal=self._portal, type=self.type if raw else None),
|
92
|
+
identifying_path,
|
93
|
+
nlookups
|
94
|
+
)
|
141
95
|
except Exception:
|
142
96
|
pass
|
143
|
-
return None,
|
97
|
+
return None, first_identifying_path, nlookups
|
144
98
|
|
145
99
|
def compare(self, value: Union[dict, PortalObject],
|
146
|
-
consider_refs: bool = False, resolved_refs: List[dict] = None) -> dict:
|
100
|
+
consider_refs: bool = False, resolved_refs: List[dict] = None) -> Tuple[dict, int]:
|
147
101
|
if consider_refs and isinstance(resolved_refs, list):
|
148
|
-
|
102
|
+
normlized_portal_object, nlookups = self._normalized_refs(refs=resolved_refs)
|
103
|
+
this_data = normlized_portal_object.data
|
149
104
|
else:
|
150
105
|
this_data = self.data
|
106
|
+
nlookups = 0
|
151
107
|
if isinstance(value, PortalObject):
|
152
108
|
comparing_data = value.data
|
153
109
|
elif isinstance(value, dict):
|
154
110
|
comparing_data = value
|
155
111
|
else:
|
156
|
-
return {}
|
157
|
-
return PortalObject._compare(this_data, comparing_data)
|
112
|
+
return {}, nlookups
|
113
|
+
return PortalObject._compare(this_data, comparing_data), nlookups
|
158
114
|
|
159
115
|
@staticmethod
|
160
116
|
def _compare(a: Any, b: Any, _path: Optional[str] = None) -> dict:
|
@@ -201,42 +157,106 @@ class PortalObject:
|
|
201
157
|
diffs[_path] = diff_updating(a, b)
|
202
158
|
return diffs
|
203
159
|
|
204
|
-
|
160
|
+
@lru_cache(maxsize=1)
|
161
|
+
def _get_identifying_paths(self, ref_lookup_strategy: Optional[Callable] = None) -> Optional[List[str]]:
|
205
162
|
"""
|
206
|
-
|
207
|
-
this Portal object into the uuid style reference (e.g. d1b67068-300f-483f-bfe8-63d23c93801f),
|
208
|
-
based on the given "refs" list which is assumed to be a list of dictionaries, where each
|
209
|
-
contains a "path" and a "uuid" property; this list is typically (for our first usage of
|
210
|
-
this function) the value of structured_data.StructuredDataSet.resolved_refs_with_uuid.
|
211
|
-
Changes are made to this Portal object in place; use normalized_refs function to make a copy.
|
212
|
-
If there are no "refs" (None or empty) or if the specified reference is not found in this
|
213
|
-
list then the references will be looked up via Portal calls (via Portal.get_metadata).
|
163
|
+
Returns a list of the possible Portal URL paths identifying this Portal object.
|
214
164
|
"""
|
215
|
-
|
165
|
+
identifying_paths = []
|
166
|
+
if not (identifying_properties := self.identifying_properties):
|
167
|
+
if self.uuid:
|
168
|
+
if self.type:
|
169
|
+
identifying_paths.append(f"/{self.type}/{self.uuid}")
|
170
|
+
identifying_paths.append(f"/{self.uuid}")
|
171
|
+
return identifying_paths
|
172
|
+
for identifying_property in identifying_properties:
|
173
|
+
if identifying_value := self._data.get(identifying_property):
|
174
|
+
if identifying_property == "uuid":
|
175
|
+
if self.type:
|
176
|
+
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
177
|
+
identifying_paths.append(f"/{identifying_value}")
|
178
|
+
# For now at least we include the path both with and without the schema type component,
|
179
|
+
# as for some identifying values, it works (only) with, and some, it works (only) without.
|
180
|
+
# For example: If we have FileSet with "accession", an identifying property, with value
|
181
|
+
# SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
|
182
|
+
# conversely using "submitted_id", also an identifying property, with value
|
183
|
+
# UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
|
184
|
+
# not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
|
185
|
+
elif isinstance(identifying_value, list):
|
186
|
+
for identifying_value_item in identifying_value:
|
187
|
+
if self.type:
|
188
|
+
identifying_paths.append(f"/{self.type}/{identifying_value_item}")
|
189
|
+
identifying_paths.append(f"/{identifying_value_item}")
|
190
|
+
else:
|
191
|
+
# TODO: Import from somewhere ...
|
192
|
+
lookup_options = 0
|
193
|
+
if schema := self.schema:
|
194
|
+
# TODO: Hook into the ref_lookup_strategy thing in structured_data to make
|
195
|
+
# sure we check accession format (since it does not have a pattern).
|
196
|
+
if callable(ref_lookup_strategy):
|
197
|
+
lookup_options, ref_validator = ref_lookup_strategy(
|
198
|
+
self.type, schema, identifying_value)
|
199
|
+
if callable(ref_validator):
|
200
|
+
if ref_validator(schema, identifying_property, identifying_value) is False:
|
201
|
+
continue
|
202
|
+
if pattern := schema.get("properties", {}).get(identifying_property, {}).get("pattern"):
|
203
|
+
if not re.match(pattern, identifying_value):
|
204
|
+
# If this identifying value is for a (identifying) property which has a
|
205
|
+
# pattern, and the value does NOT match the pattern, then do NOT include
|
206
|
+
# this value as an identifying path, since it cannot possibly be found.
|
207
|
+
continue
|
208
|
+
if not lookup_options:
|
209
|
+
lookup_options = Portal.LOOKUP_DEFAULT
|
210
|
+
if Portal.is_lookup_root_first(lookup_options):
|
211
|
+
identifying_paths.append(f"/{identifying_value}")
|
212
|
+
if Portal.is_lookup_specified_type(lookup_options) and self.type:
|
213
|
+
identifying_paths.append(f"/{self.type}/{identifying_value}")
|
214
|
+
if Portal.is_lookup_root(lookup_options) and not Portal.is_lookup_root_first(lookup_options):
|
215
|
+
identifying_paths.append(f"/{identifying_value}")
|
216
|
+
if Portal.is_lookup_subtypes(lookup_options):
|
217
|
+
for subtype_name in self._portal.get_schema_subtype_names(self.type):
|
218
|
+
identifying_paths.append(f"/{subtype_name}/{identifying_value}")
|
219
|
+
return identifying_paths or None
|
216
220
|
|
217
|
-
def
|
221
|
+
def _normalized_refs(self, refs: List[dict]) -> Tuple[PortalObject, int]:
|
218
222
|
"""
|
219
|
-
Same as
|
223
|
+
Same as _normalize_refs but does NOT make this change to this Portal object IN PLACE,
|
220
224
|
rather it returns a new instance of this Portal object wrapped in a new PortalObject.
|
221
225
|
"""
|
222
226
|
portal_object = self.copy()
|
223
|
-
portal_object.
|
224
|
-
return portal_object
|
227
|
+
nlookups = portal_object._normalize_refs(refs)
|
228
|
+
return portal_object, nlookups
|
229
|
+
|
230
|
+
def _normalize_refs(self, refs: List[dict]) -> int:
|
231
|
+
"""
|
232
|
+
Turns any (linkTo) references which are paths (e.g. /SubmissionCenter/uwsc_gcc) within this
|
233
|
+
object IN PLACE into the uuid style reference (e.g. d1b67068-300f-483f-bfe8-63d23c93801f),
|
234
|
+
based on the given "refs" list which is assumed to be a list of dictionaries, where each
|
235
|
+
contains a "path" and a "uuid" property; this list is typically (for our first usage of
|
236
|
+
this function) the value of structured_data.StructuredDataSet.resolved_refs_with_uuid.
|
237
|
+
Changes are made to this Portal object IN PLACE; use _normalized_refs function to make a copy.
|
238
|
+
If there are no "refs" (None or empty) or if the specified reference is not found in this
|
239
|
+
list then the references will be looked up via Portal calls (via Portal.get_metadata).
|
240
|
+
"""
|
241
|
+
_, nlookups = PortalObject._normalize_data_refs(self.data, refs=refs, schema=self.schema, portal=self.portal)
|
242
|
+
return nlookups
|
225
243
|
|
226
244
|
@staticmethod
|
227
|
-
def
|
245
|
+
def _normalize_data_refs(value: Any, refs: List[dict], schema: dict,
|
246
|
+
portal: Portal, _path: Optional[str] = None) -> Tuple[Any, int]:
|
247
|
+
nlookups = 0
|
228
248
|
if not value or not isinstance(schema, dict):
|
229
|
-
return value
|
249
|
+
return value, nlookups
|
230
250
|
if isinstance(value, dict):
|
231
251
|
for key in value:
|
232
252
|
path = f"{_path}.{key}" if _path else key
|
233
|
-
value[key] = PortalObject.
|
234
|
-
|
253
|
+
value[key], nlookups = PortalObject._normalize_data_refs(value[key], refs=refs,
|
254
|
+
schema=schema, portal=portal, _path=path)
|
235
255
|
elif isinstance(value, list):
|
236
256
|
for index in range(len(value)):
|
237
257
|
path = f"{_path or ''}#{index}"
|
238
|
-
value[index] = PortalObject.
|
239
|
-
|
258
|
+
value[index], nlookups = PortalObject._normalize_data_refs(value[index], refs=refs,
|
259
|
+
schema=schema, portal=portal, _path=path)
|
240
260
|
elif value_type := Schema.get_property_by_path(schema, _path):
|
241
261
|
if link_to := value_type.get("linkTo"):
|
242
262
|
ref_path = f"/{link_to}/{value}"
|
@@ -247,7 +267,7 @@ class PortalObject:
|
|
247
267
|
else:
|
248
268
|
ref_uuid = None
|
249
269
|
if ref_uuid:
|
250
|
-
return ref_uuid
|
270
|
+
return ref_uuid, nlookups
|
251
271
|
# Here our (linkTo) reference appears not to be in the given refs; if these refs came
|
252
272
|
# from structured_data.StructuredDataSet.resolved_refs_with_uuid (in the context of
|
253
273
|
# smaht-submitr, which is the typical/first use case for this function) then this could
|
@@ -255,6 +275,7 @@ class PortalObject:
|
|
255
275
|
# the data/spreadsheet being submitted. In any case, we don't have the associated uuid
|
256
276
|
# so let us look it up here.
|
257
277
|
if isinstance(portal, Portal):
|
278
|
+
nlookups += 1
|
258
279
|
if (ref_object := portal.get_metadata(ref_path)) and (ref_uuid := ref_object.get("uuid")):
|
259
|
-
return ref_uuid
|
260
|
-
return value
|
280
|
+
return ref_uuid, nlookups
|
281
|
+
return value, nlookups
|
dcicutils/portal_utils.py
CHANGED
@@ -46,6 +46,22 @@ class Portal:
|
|
46
46
|
KEYS_FILE_DIRECTORY = "~"
|
47
47
|
MIME_TYPE_JSON = "application/json"
|
48
48
|
|
49
|
+
# Object lookup strategies; on a per-reference (type/value) basis, used currently ONLY by
|
50
|
+
# structured_data.py; controlled by an optional ref_lookup_strategy callable; default is
|
51
|
+
# lookup at root path but after the specified type path lookup, and then lookup all subtypes;
|
52
|
+
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
53
|
+
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
54
|
+
# and value (string) arguments and return an integer of any of the below ORed together.
|
55
|
+
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
56
|
+
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
57
|
+
# currently (smaht-portal/.../ingestion_processors) use LOOKUP_ROOT_FIRST for this.
|
58
|
+
# And current usage NEVER has LOOKUP_SUBTYPES turned OFF; but support just in case.
|
59
|
+
LOOKUP_SPECIFIED_TYPE = 0x0001
|
60
|
+
LOOKUP_ROOT = 0x0002
|
61
|
+
LOOKUP_ROOT_FIRST = 0x0004 | LOOKUP_ROOT
|
62
|
+
LOOKUP_SUBTYPES = 0x0008
|
63
|
+
LOOKUP_DEFAULT = LOOKUP_SPECIFIED_TYPE | LOOKUP_ROOT | LOOKUP_SUBTYPES
|
64
|
+
|
49
65
|
def __init__(self,
|
50
66
|
arg: Optional[Union[Portal, TestApp, VirtualApp, PyramidRouter, dict, tuple, str]] = None,
|
51
67
|
env: Optional[str] = None, server: Optional[str] = None,
|
@@ -188,9 +204,27 @@ class Portal:
|
|
188
204
|
def vapp(self) -> Optional[TestApp]:
|
189
205
|
return self._vapp
|
190
206
|
|
207
|
+
@staticmethod
|
208
|
+
def is_lookup_specified_type(lookup_options: int) -> bool:
|
209
|
+
return (lookup_options &
|
210
|
+
Portal.LOOKUP_SPECIFIED_TYPE) == Portal.LOOKUP_SPECIFIED_TYPE
|
211
|
+
|
212
|
+
@staticmethod
|
213
|
+
def is_lookup_root(lookup_options: int) -> bool:
|
214
|
+
return (lookup_options & Portal.LOOKUP_ROOT) == Portal.LOOKUP_ROOT
|
215
|
+
|
216
|
+
@staticmethod
|
217
|
+
def is_lookup_root_first(lookup_options: int) -> bool:
|
218
|
+
return (lookup_options & Portal.LOOKUP_ROOT_FIRST) == Portal.LOOKUP_ROOT_FIRST
|
219
|
+
|
220
|
+
@staticmethod
|
221
|
+
def is_lookup_subtypes(lookup_options: int) -> bool:
|
222
|
+
return (lookup_options & Portal.LOOKUP_SUBTYPES) == Portal.LOOKUP_SUBTYPES
|
223
|
+
|
191
224
|
def get(self, url: str, follow: bool = True,
|
192
225
|
raw: bool = False, database: bool = False, raise_for_status: bool = False, **kwargs) -> OptionalResponse:
|
193
226
|
url = self.url(url, raw, database)
|
227
|
+
# print(f'xyzzy.portal.get({url})')
|
194
228
|
if not self.vapp:
|
195
229
|
response = requests.get(url, allow_redirects=follow, **self._kwargs(**kwargs))
|
196
230
|
else:
|
@@ -205,6 +239,7 @@ class Portal:
|
|
205
239
|
def patch(self, url: str, data: Optional[dict] = None, json: Optional[dict] = None,
|
206
240
|
raise_for_status: bool = False, **kwargs) -> OptionalResponse:
|
207
241
|
url = self.url(url)
|
242
|
+
# print(f'xyzzy.portal.patch({url})')
|
208
243
|
if not self.vapp:
|
209
244
|
response = requests.patch(url, data=data, json=json, **self._kwargs(**kwargs))
|
210
245
|
else:
|
@@ -217,6 +252,7 @@ class Portal:
|
|
217
252
|
def post(self, url: str, data: Optional[dict] = None, json: Optional[dict] = None, files: Optional[dict] = None,
|
218
253
|
raise_for_status: bool = False, **kwargs) -> OptionalResponse:
|
219
254
|
url = self.url(url)
|
255
|
+
# print(f'xyzzy.portal.post({url})')
|
220
256
|
if files and not ("headers" in kwargs):
|
221
257
|
# Setting headers to None when using files implies content-type multipart/form-data.
|
222
258
|
kwargs["headers"] = None
|
@@ -233,6 +269,7 @@ class Portal:
|
|
233
269
|
return response
|
234
270
|
|
235
271
|
def get_metadata(self, object_id: str, raw: bool = False, database: bool = False) -> Optional[dict]:
|
272
|
+
# print(f'xyzzy.portal.get_metadata({object_id})')
|
236
273
|
if isinstance(raw, bool) and raw:
|
237
274
|
add_on = "frame=raw" + ("&datastore=database" if isinstance(database, bool) and database else "")
|
238
275
|
elif database:
|
@@ -242,11 +279,13 @@ class Portal:
|
|
242
279
|
return get_metadata(obj_id=object_id, vapp=self.vapp, key=self.key, add_on=add_on)
|
243
280
|
|
244
281
|
def patch_metadata(self, object_id: str, data: dict) -> Optional[dict]:
|
282
|
+
# print(f'xyzzy.portal.patch_metadata({object_id})')
|
245
283
|
if self.key:
|
246
284
|
return patch_metadata(obj_id=object_id, patch_item=data, key=self.key)
|
247
285
|
return self.patch(f"/{object_id}", data).json()
|
248
286
|
|
249
287
|
def post_metadata(self, object_type: str, data: dict) -> Optional[dict]:
|
288
|
+
# print(f'xyzzy.portal.post_metadata({object_type})')
|
250
289
|
if self.key:
|
251
290
|
return post_metadata(schema_name=object_type, post_item=data, key=self.key)
|
252
291
|
return self.post(f"/{object_type}", data).json()
|
@@ -358,6 +397,12 @@ class Portal:
|
|
358
397
|
super_type_map_flattened[super_type_name] = list_breadth_first(super_type_map, super_type_name)
|
359
398
|
return super_type_map_flattened
|
360
399
|
|
400
|
+
@lru_cache(maxsize=64)
|
401
|
+
def get_schema_subtype_names(self, type_name: str) -> List[str]:
|
402
|
+
if not (schemas_super_type_map := self.get_schemas_super_type_map()):
|
403
|
+
return []
|
404
|
+
return schemas_super_type_map.get(type_name, [])
|
405
|
+
|
361
406
|
def url(self, url: str, raw: bool = False, database: bool = False) -> str:
|
362
407
|
if not isinstance(url, str) or not url:
|
363
408
|
return "/"
|
dcicutils/structured_data.py
CHANGED
@@ -48,22 +48,6 @@ StructuredDataSet = Type["StructuredDataSet"]
|
|
48
48
|
|
49
49
|
class StructuredDataSet:
|
50
50
|
|
51
|
-
# Reference (linkTo) lookup strategies; on a per-reference (type/value) basis;
|
52
|
-
# controlled by optional ref_lookup_strategy callable; default is lookup at root path
|
53
|
-
# but after the named reference (linkTo) type path lookup, and then lookup all subtypes;
|
54
|
-
# can choose to lookup root path first, or not lookup root path at all, or not lookup
|
55
|
-
# subtypes at all; the ref_lookup_strategy callable if specified should take a type_name
|
56
|
-
# and value (string) arguments and return an integer of any of the below ORed together.
|
57
|
-
# The main purpose of this is optimization; to minimize portal lookups; since for example,
|
58
|
-
# currently at least, /{type}/{accession} does not work but /{accession} does; so we
|
59
|
-
# currently (smaht-portal/.../ingestion_processors) use REF_LOOKUP_ROOT_FIRST for this.
|
60
|
-
# And current usage NEVER has REF_LOOKUP_SUBTYPES turned OFF; but support just in case.
|
61
|
-
REF_LOOKUP_SPECIFIED_TYPE = 0x0001
|
62
|
-
REF_LOOKUP_ROOT = 0x0002
|
63
|
-
REF_LOOKUP_ROOT_FIRST = 0x0004 | REF_LOOKUP_ROOT
|
64
|
-
REF_LOOKUP_SUBTYPES = 0x0008
|
65
|
-
REF_LOOKUP_DEFAULT = REF_LOOKUP_SPECIFIED_TYPE | REF_LOOKUP_ROOT | REF_LOOKUP_SUBTYPES
|
66
|
-
|
67
51
|
def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
|
68
52
|
schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
|
69
53
|
order: Optional[List[str]] = None, prune: bool = True,
|
@@ -76,6 +60,7 @@ class StructuredDataSet:
|
|
76
60
|
self._portal = Portal(portal, data=self._data, schemas=schemas,
|
77
61
|
ref_lookup_strategy=ref_lookup_strategy,
|
78
62
|
ref_lookup_nocache=ref_lookup_nocache) if portal else None
|
63
|
+
self._ref_lookup_strategy = ref_lookup_strategy
|
79
64
|
self._order = order
|
80
65
|
self._prune = prune
|
81
66
|
self._warnings = {}
|
@@ -199,25 +184,49 @@ class StructuredDataSet:
|
|
199
184
|
upload_file["path"] = file_path
|
200
185
|
return upload_files
|
201
186
|
|
202
|
-
def compare(self) -> dict:
|
187
|
+
def compare(self, progress: Optional[Callable] = None) -> dict:
|
188
|
+
def get_counts() -> int:
|
189
|
+
ntypes = 0
|
190
|
+
nobjects = 0
|
191
|
+
if self.data:
|
192
|
+
ntypes = len(self.data)
|
193
|
+
for type_name in self.data:
|
194
|
+
nobjects += len(self.data[type_name])
|
195
|
+
return ntypes, nobjects
|
203
196
|
diffs = {}
|
204
|
-
if
|
197
|
+
if callable(progress):
|
198
|
+
ntypes, nobjects = get_counts()
|
199
|
+
progress({"start": True, "types": ntypes, "objects": nobjects})
|
200
|
+
if self.data or self.portal: # TODO: what is this OR biz?
|
205
201
|
refs = self.resolved_refs_with_uuids
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
202
|
+
# TODO: Need feedback/progress tracking mechanism here.
|
203
|
+
# TODO: Check validity of reference; actually check that earlier on even maybe.
|
204
|
+
for type_name in self.data:
|
205
|
+
if not diffs.get(type_name):
|
206
|
+
diffs[type_name] = []
|
207
|
+
for portal_object in self.data[type_name]:
|
208
|
+
portal_object = PortalObject(portal_object, portal=self.portal, type=type_name)
|
209
|
+
existing_object, identifying_path, nlookups = (
|
210
|
+
portal_object.lookup(raw=True, ref_lookup_strategy=self._ref_lookup_strategy))
|
212
211
|
if existing_object:
|
213
|
-
object_diffs = portal_object.compare(
|
214
|
-
|
215
|
-
|
216
|
-
|
212
|
+
object_diffs, nlookups_compare = portal_object.compare(
|
213
|
+
existing_object, consider_refs=True, resolved_refs=refs)
|
214
|
+
diffs[type_name].append(create_readonly_object(path=identifying_path,
|
215
|
+
uuid=existing_object.uuid,
|
216
|
+
diffs=object_diffs or None))
|
217
|
+
if callable(progress):
|
218
|
+
progress({"update": True, "lookups": nlookups + nlookups_compare})
|
217
219
|
elif identifying_path:
|
218
220
|
# If there is no existing object we still create a record for this object
|
219
221
|
# but with no uuid which will be the indication that it does not exist.
|
220
|
-
diffs[
|
222
|
+
diffs[type_name].append(create_readonly_object(path=identifying_path, uuid=None, diffs=None))
|
223
|
+
if callable(progress):
|
224
|
+
progress({"create": True, "lookups": nlookups})
|
225
|
+
else:
|
226
|
+
if callable(progress):
|
227
|
+
progress({"lookups": nlookups})
|
228
|
+
if callable(progress):
|
229
|
+
progress({"finish": True})
|
221
230
|
return diffs
|
222
231
|
|
223
232
|
def _load_file(self, file: str) -> None:
|
@@ -251,7 +260,7 @@ class StructuredDataSet:
|
|
251
260
|
self._load_reader(CsvReader(file), type_name=Schema.type_name(file))
|
252
261
|
|
253
262
|
def _load_excel_file(self, file: str) -> None:
|
254
|
-
def
|
263
|
+
def get_counts() -> Tuple[int, int]:
|
255
264
|
nonlocal file
|
256
265
|
excel = Excel(file)
|
257
266
|
nrows = 0
|
@@ -260,18 +269,27 @@ class StructuredDataSet:
|
|
260
269
|
nrows += 1
|
261
270
|
return nrows, len(excel.sheet_names)
|
262
271
|
if self._progress:
|
263
|
-
|
272
|
+
nrows, nsheets = get_counts()
|
273
|
+
self._progress({"start": True, "sheets": nsheets, "rows": nrows})
|
274
|
+
"""
|
275
|
+
if self._progress:
|
276
|
+
self._progress_update(get_counts)
|
277
|
+
"""
|
264
278
|
excel = Excel(file) # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl).
|
265
279
|
order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {}
|
266
280
|
for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)):
|
267
281
|
self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name))
|
282
|
+
if self._progress:
|
283
|
+
self._progress({"finish": True})
|
284
|
+
# TODO: Do we really need progress reporting for the below?
|
268
285
|
# Check for unresolved reference errors which really are not because of ordering.
|
269
286
|
# Yes such internal references will be handled correctly on actual database update via snovault.loadxl.
|
270
287
|
if ref_errors := self.ref_errors:
|
271
288
|
ref_errors_actual = []
|
272
289
|
for ref_error in ref_errors:
|
273
290
|
if not (resolved := self.portal.ref_exists(ref := ref_error["error"])):
|
274
|
-
# if
|
291
|
+
# TODO: Probably do this instead; and if so then no progress needed (per question above).
|
292
|
+
# if not (resolved := self.portal.ref_exists_internally(ref := ref_error["error"])):
|
275
293
|
ref_errors_actual.append(ref_error)
|
276
294
|
else:
|
277
295
|
self._resolved_refs.add((ref, resolved.get("uuid")))
|
@@ -304,13 +322,15 @@ class StructuredDataSet:
|
|
304
322
|
self._add_properties(structured_row, self._autoadd_properties, schema)
|
305
323
|
self._add(type_name, structured_row)
|
306
324
|
if self._progress:
|
307
|
-
self.
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
325
|
+
self._progress({
|
326
|
+
"parse": True,
|
327
|
+
"refs": self.ref_total_count,
|
328
|
+
"refs_found": self.ref_total_found_count,
|
329
|
+
"refs_not_found": self.ref_total_notfound_count,
|
330
|
+
"refs_lookup": self.ref_lookup_count,
|
331
|
+
"refs_cache_hit": self.ref_lookup_cache_hit_count,
|
332
|
+
"refs_invalid": self.ref_invalid_identifying_property_count
|
333
|
+
})
|
314
334
|
self._note_warning(reader.warnings, "reader")
|
315
335
|
if schema:
|
316
336
|
self._note_error(schema._unresolved_refs, "ref")
|
@@ -331,16 +351,16 @@ class StructuredDataSet:
|
|
331
351
|
|
332
352
|
def _is_ref_lookup_specified_type(ref_lookup_flags: int) -> bool:
|
333
353
|
return (ref_lookup_flags &
|
334
|
-
|
354
|
+
Portal.LOOKUP_SPECIFIED_TYPE) == Portal.LOOKUP_SPECIFIED_TYPE
|
335
355
|
|
336
356
|
def _is_ref_lookup_root(ref_lookup_flags: int) -> bool:
|
337
|
-
return (ref_lookup_flags &
|
357
|
+
return (ref_lookup_flags & Portal.LOOKUP_ROOT) == Portal.LOOKUP_ROOT
|
338
358
|
|
339
359
|
def _is_ref_lookup_root_first(ref_lookup_flags: int) -> bool:
|
340
|
-
return (ref_lookup_flags &
|
360
|
+
return (ref_lookup_flags & Portal.LOOKUP_ROOT_FIRST) == Portal.LOOKUP_ROOT_FIRST
|
341
361
|
|
342
362
|
def _is_ref_lookup_subtypes(ref_lookup_flags: int) -> bool:
|
343
|
-
return (ref_lookup_flags &
|
363
|
+
return (ref_lookup_flags & Portal.LOOKUP_SUBTYPES) == Portal.LOOKUP_SUBTYPES
|
344
364
|
|
345
365
|
@property
|
346
366
|
def ref_total_count(self) -> int:
|
@@ -786,7 +806,7 @@ class Portal(PortalBase):
|
|
786
806
|
if callable(ref_lookup_strategy):
|
787
807
|
self._ref_lookup_strategy = ref_lookup_strategy
|
788
808
|
else:
|
789
|
-
self._ref_lookup_strategy = lambda type_name, schema, value: (
|
809
|
+
self._ref_lookup_strategy = lambda type_name, schema, value: (Portal.LOOKUP_DEFAULT, None)
|
790
810
|
if ref_lookup_nocache is True:
|
791
811
|
self.ref_lookup = self.ref_lookup_uncached
|
792
812
|
self._ref_cache = None
|
@@ -844,7 +864,7 @@ class Portal(PortalBase):
|
|
844
864
|
return schemas
|
845
865
|
|
846
866
|
@lru_cache(maxsize=64)
|
847
|
-
def
|
867
|
+
def _get_schema_subtype_names(self, type_name: str) -> List[str]:
|
848
868
|
if not (schemas_super_type_map := self.get_schemas_super_type_map()):
|
849
869
|
return []
|
850
870
|
return schemas_super_type_map.get(type_name, [])
|
@@ -907,7 +927,7 @@ class Portal(PortalBase):
|
|
907
927
|
lookup_paths.append(f"/{type_name}/{value}")
|
908
928
|
if is_ref_lookup_root and not is_ref_lookup_root_first:
|
909
929
|
lookup_paths.append(f"/{value}")
|
910
|
-
subtype_names = self.
|
930
|
+
subtype_names = self._get_schema_subtype_names(type_name) if is_ref_lookup_subtypes else []
|
911
931
|
for subtype_name in subtype_names:
|
912
932
|
lookup_paths.append(f"/{subtype_name}/{value}")
|
913
933
|
if not lookup_paths:
|
@@ -946,7 +966,7 @@ class Portal(PortalBase):
|
|
946
966
|
ref_lookup_strategy, ref_validator = (
|
947
967
|
self._ref_lookup_strategy(type_name, self.get_schema(type_name), value))
|
948
968
|
is_ref_lookup_subtypes = StructuredDataSet._is_ref_lookup_subtypes(ref_lookup_strategy)
|
949
|
-
subtype_names = self.
|
969
|
+
subtype_names = self._get_schema_subtype_names(type_name) if is_ref_lookup_subtypes else []
|
950
970
|
for type_name in [type_name] + subtype_names:
|
951
971
|
is_resolved, resolved_item = self._ref_exists_single_internally(type_name, value)
|
952
972
|
if is_resolved:
|
@@ -1008,7 +1028,7 @@ class Portal(PortalBase):
|
|
1008
1028
|
if not is_uuid(property_value):
|
1009
1029
|
return False
|
1010
1030
|
return True
|
1011
|
-
for schema_name in [type_name] + self.
|
1031
|
+
for schema_name in [type_name] + self._get_schema_subtype_names(type_name):
|
1012
1032
|
if schema := self.get_schema(schema_name):
|
1013
1033
|
if identifying_properties := schema.get("identifyingProperties"):
|
1014
1034
|
for identifying_property in identifying_properties:
|
@@ -1033,7 +1053,7 @@ class Portal(PortalBase):
|
|
1033
1053
|
|
1034
1054
|
def _cache_ref(self, type_name: str, value: str, resolved: List[str]) -> None:
|
1035
1055
|
if self._ref_cache is not None:
|
1036
|
-
subtype_names = self.
|
1056
|
+
subtype_names = self._get_schema_subtype_names(type_name)
|
1037
1057
|
for type_name in [type_name] + subtype_names:
|
1038
1058
|
self._ref_cache[f"/{type_name}/{value}"] = resolved
|
1039
1059
|
|
@@ -46,8 +46,8 @@ dcicutils/log_utils.py,sha256=7pWMc6vyrorUZQf-V-M3YC6zrPgNhuV_fzm9xqTPph0,10883
|
|
46
46
|
dcicutils/misc_utils.py,sha256=zVc4urdVGgnWjQ4UQlrGH-URAzr2l_PwZWI3u_GJdFE,102210
|
47
47
|
dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmjw,5963
|
48
48
|
dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
|
49
|
-
dcicutils/portal_object_utils.py,sha256=
|
50
|
-
dcicutils/portal_utils.py,sha256=
|
49
|
+
dcicutils/portal_object_utils.py,sha256=7gteQ5CM6IVDfHx-UPFiOfeE1fJYOir_uwWdRTykExQ,15374
|
50
|
+
dcicutils/portal_utils.py,sha256=trM8L9J1CPXntdsKSL56hy7SMpftNNpIReAn5iihGes,30050
|
51
51
|
dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
|
52
52
|
dcicutils/qa_checkers.py,sha256=cdXjeL0jCDFDLT8VR8Px78aS10hwNISOO5G_Zv2TZ6M,20534
|
53
53
|
dcicutils/qa_utils.py,sha256=TT0SiJWiuxYvbsIyhK9VO4uV_suxhB6CpuC4qPacCzQ,160208
|
@@ -62,15 +62,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
|
|
62
62
|
dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
|
63
63
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
64
64
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
65
|
-
dcicutils/structured_data.py,sha256=
|
65
|
+
dcicutils/structured_data.py,sha256=aXyLqYlgp5DInGiWw6WZk08vfvKI0IW2eWW57n0zhz0,57549
|
66
66
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
67
67
|
dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
|
68
68
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
69
69
|
dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
|
70
70
|
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
71
71
|
dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
|
72
|
-
dcicutils-8.8.0.
|
73
|
-
dcicutils-8.8.0.
|
74
|
-
dcicutils-8.8.0.
|
75
|
-
dcicutils-8.8.0.
|
76
|
-
dcicutils-8.8.0.
|
72
|
+
dcicutils-8.8.0.1b24.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
|
73
|
+
dcicutils-8.8.0.1b24.dist-info/METADATA,sha256=z5fS6VG0wLUCd3NTEvsMF5rdWwLg7TmWlTs1Syg-7Sk,3357
|
74
|
+
dcicutils-8.8.0.1b24.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
75
|
+
dcicutils-8.8.0.1b24.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
|
76
|
+
dcicutils-8.8.0.1b24.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|