dcicutils 8.8.1.1b6__py3-none-any.whl → 8.8.1.1b9__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- dcicutils/progress_constants.py +48 -0
- dcicutils/structured_data.py +32 -61
- {dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/METADATA +1 -1
- {dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/RECORD +7 -6
- {dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/entry_points.txt +0 -0
dcicutils/progress_constants.py
ADDED
@@ -0,0 +1,48 @@
+from enum import Enum
+
+
+# Constants for progress tracking for smaht-submitr.
+# Here only to share between smaht-portal, snovault, and smaht-submitr.
+
+class PROGRESS_INGESTER(Enum):
+    VALIDATION = "ingester_validation"
+    INITIATE = "ingester_initiate"
+
+
+class PROGRESS_PARSE(Enum):
+    LOAD_START = "start"
+    LOAD_ITEM = "parse"
+    LOAD_DONE = "finish"
+    LOAD_COUNT_SHEETS = "sheets"
+    LOAD_COUNT_ROWS = "rows"
+    LOAD_COUNT_REFS = "refs"
+    LOAD_COUNT_REFS_FOUND = "refs_found"
+    LOAD_COUNT_REFS_NOT_FOUND = "refs_not_found"
+    LOAD_COUNT_REFS_LOOKUP = "refs_lookup"
+    LOAD_COUNT_REFS_LOOKUP_CACHE_HIT = "refs_lookup_cache_hit"
+    LOAD_COUNT_REFS_EXISTS_CACHE_HIT = "refs_exists_cache_hit"
+    LOAD_COUNT_REFS_INVALID = "refs_invalid"
+    ANALYZE_START = "start"
+    ANALYZE_COUNT_TYPES = "types"
+    ANALYZE_COUNT_ITEMS = "objects"
+    ANALYZE_CREATE = "create"
+    ANALYZE_COUNT_LOOKUP = "lookups"
+    ANALYZE_UPDATE = "update"
+    ANALYZE_DONE = "finish"
+
+
+class PROGRESS_LOADXL(Enum):
+    INITIATE = "loadxl_initiate"
+    START = "loadxl_start"
+    START_SECOND_ROUND = "loadxl_start_second_round"
+    ITEM = "loadxl_item"
+    ITEM_SECOND_ROUND = "loadxl_item_second_round"
+    GET = "loadxl_lookup"
+    POST = "loadxl_post"
+    PATCH = "loadxl_patch"
+    ERROR = "loadxl_error"
+    DONE = "loadxl_done"
+    TOTAL = "loadxl_total"
+    MESSAGE = "loadxl_message"
+    MESSAGE_VERBOSE = "loadxl_message_verbose"
+    MESSAGE_DEBUG = "loadxl_message_debug"
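Note (illustration, not part of the diff): in the structured_data.py changes below, these enum members are used as keys of the dictionaries passed to a caller-supplied progress callback. A minimal sketch of such a consumer, assuming only what the diff shows (boolean flags for the start/item/done events and integer counts for the rest); the class name and its attributes are hypothetical:

    from dcicutils.progress_constants import PROGRESS_PARSE as PROGRESS

    class ParseProgressTracker:
        """Hypothetical consumer that tallies PROGRESS_PARSE updates from structured_data.py."""

        def __init__(self) -> None:
            self.total_rows = None   # set from the LOAD_START update
            self.rows_processed = 0  # incremented on each LOAD_ITEM update
            self.done = False        # set on the LOAD_DONE update

        def __call__(self, update: dict) -> None:
            # Each update is a dict keyed by PROGRESS_PARSE members, as emitted in the hunks below.
            if PROGRESS.LOAD_COUNT_ROWS in update:
                self.total_rows = update[PROGRESS.LOAD_COUNT_ROWS]
            if update.get(PROGRESS.LOAD_ITEM):
                self.rows_processed += 1
            if update.get(PROGRESS.LOAD_DONE):
                self.done = True

An instance of such a class could be passed wherever the new progress parameter below accepts a Callable.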
dcicutils/structured_data.py
CHANGED
@@ -17,6 +17,7 @@ from dcicutils.misc_utils import (create_dict, create_readonly_object, is_uuid,
                                   to_boolean, to_enum, to_float, to_integer, VirtualApp)
 from dcicutils.portal_object_utils import PortalObject
 from dcicutils.portal_utils import Portal as PortalBase
+from dcicutils.progress_constants import PROGRESS_PARSE as PROGRESS
 from dcicutils.schema_utils import Schema as SchemaBase
 from dcicutils.zip_utils import unpack_gz_file_to_temporary_file, unpack_files
 
@@ -37,31 +38,10 @@ ARRAY_NAME_SUFFIX_CHAR = "#"
 ARRAY_NAME_SUFFIX_REGEX = re.compile(rf"{ARRAY_NAME_SUFFIX_CHAR}\d+")
 DOTTED_NAME_DELIMITER_CHAR = "."
 
+
 # TODO: Should probably pass this knowledge in from callers.
 FILE_TYPE_NAME = "File"
 FILE_TYPE_PROPERTY_NAME = "filename"
-# This ExtraFile is a pseudo-type to handle extra_files in smaht-submitr.
-EXTRA_FILE_TYPE_NAME = "ExtraFile"
-EXTRA_FILE_TYPE_PROPERTY_NAME = "extra_files"
-
-ENABLE_ARRAY_SHEET_REFS = False
-
-# The ExtraFile pseudo-type schema.
-EXTRA_FILE_SCHEMA = {
-    "title": "ExtraFile",
-    "type": "object",
-    "required": [
-        "filename"
-    ],
-    "identifyingProperties": [
-        "filename"
-    ],
-    "properties": {
-        "filename": {
-            "type": "string"
-        }
-    }
-}
 
 # Forward type references for type hints.
 Portal = Type["Portal"]
@@ -117,10 +97,11 @@ class StructuredDataSet:
              ref_lookup_strategy: Optional[Callable] = None,
              ref_lookup_nocache: bool = False,
              norefs: bool = False,
+             progress: Optional[Callable] = None,
              debug_sleep: Optional[str] = None) -> StructuredDataSet:
         return StructuredDataSet(file=file, portal=portal, schemas=schemas, autoadd=autoadd, order=order, prune=prune,
                                  ref_lookup_strategy=ref_lookup_strategy, ref_lookup_nocache=ref_lookup_nocache,
-                                 norefs=norefs, debug_sleep=debug_sleep)
+                                 norefs=norefs, progress=progress, debug_sleep=debug_sleep)
 
     def validate(self, force: bool = False) -> None:
         def data_without_deleted_properties(data: dict) -> dict:
@@ -211,7 +192,8 @@ class StructuredDataSet:
         diffs = {}
         if callable(progress):
             ntypes, nobjects = get_counts()
-            progress({
+            progress({PROGRESS.ANALYZE_START: True,
+                      PROGRESS.ANALYZE_COUNT_TYPES: ntypes, PROGRESS.ANALYZE_COUNT_ITEMS: nobjects})
         if self.data or self.portal:  # TODO: what is this OR biz?
             refs = self.resolved_refs_with_uuids
             # TODO: Need feedback/progress tracking mechanism here.
@@ -230,18 +212,19 @@ class StructuredDataSet:
                                                        uuid=existing_object.uuid,
                                                        diffs=object_diffs or None))
                        if callable(progress):
-                            progress({
+                            progress({PROGRESS.ANALYZE_UPDATE: True,
+                                      PROGRESS.ANALYZE_COUNT_LOOKUP: nlookups + nlookups_compare})
                    elif identifying_path:
                        # If there is no existing object we still create a record for this object
                        # but with no uuid which will be the indication that it does not exist.
                        diffs[type_name].append(create_readonly_object(path=identifying_path, uuid=None, diffs=None))
                        if callable(progress):
-                            progress({
+                            progress({PROGRESS.ANALYZE_CREATE: True, PROGRESS.ANALYZE_COUNT_LOOKUP: nlookups})
                    else:
                        if callable(progress):
-                            progress({
+                            progress({PROGRESS.ANALYZE_COUNT_LOOKUP: nlookups})
         if callable(progress):
-            progress({
+            progress({PROGRESS.ANALYZE_DONE: True})
         return diffs
 
     def load_file(self, file: str) -> None:
@@ -286,9 +269,10 @@ class StructuredDataSet:
                 for row in excel.sheet_reader(sheet_name):
                     nrows += 1
             return nrows, len(excel.sheet_names)
-        if self._progress:
+        if self._progress:  # TODO: Move to _load_reader
             nrows, nsheets = get_counts()
-            self._progress({
+            self._progress({PROGRESS.LOAD_START: True,
+                            PROGRESS.LOAD_COUNT_SHEETS: nsheets, PROGRESS.LOAD_COUNT_ROWS: nrows})
         excel = Excel(file)  # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl).
         order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {}
         for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)):
@@ -312,16 +296,15 @@ class StructuredDataSet:
         else:
             del self._errors["ref"]
         if self._progress:
-            # TODO: Refactor with same thing below in _load_reader.
-
-
-
-
-
-
-
-
-                "refs_invalid": self.ref_invalid_identifying_property_count
+            self._progress({  # TODO: Refactor with same thing below in _load_reader.
+                PROGRESS.LOAD_DONE: True,
+                PROGRESS.LOAD_COUNT_REFS: self.ref_total_count,
+                PROGRESS.LOAD_COUNT_REFS_FOUND: self.ref_total_found_count,
+                PROGRESS.LOAD_COUNT_REFS_NOT_FOUND: self.ref_total_notfound_count,
+                PROGRESS.LOAD_COUNT_REFS_LOOKUP: self.ref_lookup_count,
+                PROGRESS.LOAD_COUNT_REFS_LOOKUP_CACHE_HIT: self.ref_lookup_cache_hit_count,
+                PROGRESS.LOAD_COUNT_REFS_EXISTS_CACHE_HIT: self.ref_exists_cache_hit_count,
+                PROGRESS.LOAD_COUNT_REFS_INVALID: self.ref_invalid_identifying_property_count
             })
 
     def _load_json_file(self, file: str) -> None:
@@ -351,14 +334,14 @@ class StructuredDataSet:
             self._add(type_name, structured_row)
             if self._progress:
                 self._progress({
-
-
-
-
-
-
-
-
+                    PROGRESS.LOAD_ITEM: True,
+                    PROGRESS.LOAD_COUNT_REFS: self.ref_total_count,
+                    PROGRESS.LOAD_COUNT_REFS_FOUND: self.ref_total_found_count,
+                    PROGRESS.LOAD_COUNT_REFS_NOT_FOUND: self.ref_total_notfound_count,
+                    PROGRESS.LOAD_COUNT_REFS_LOOKUP: self.ref_lookup_count,
+                    PROGRESS.LOAD_COUNT_REFS_LOOKUP_CACHE_HIT: self.ref_lookup_cache_hit_count,
+                    PROGRESS.LOAD_COUNT_REFS_EXISTS_CACHE_HIT: self.ref_exists_cache_hit_count,
+                    PROGRESS.LOAD_COUNT_REFS_INVALID: self.ref_invalid_identifying_property_count
                 })
         self._note_warning(reader.warnings, "reader")
         if schema:
@@ -464,12 +447,10 @@ class StructuredDataSet:
 
 class _StructuredRowTemplate:
 
-    def __init__(self, column_names: List[str], schema: Optional[Schema] = None,
-                 obtain_array_values: Optional[Callable] = None) -> None:
+    def __init__(self, column_names: List[str], schema: Optional[Schema] = None) -> None:
         self._schema = schema
         self._set_value_functions = {}
         self._template = self._create_row_template(column_names)
-        self._obtain_array_values = obtain_array_values if callable(obtain_array_values) else None
 
     def create_row(self) -> dict:
         return copy.deepcopy(self._template)
@@ -529,16 +510,6 @@ class _StructuredRowTemplate:
                     set_value_backtrack_object(i, p)
                 data = data[p]
             if (p := path[-1]) == -1 and isinstance(value, str):
-                if ENABLE_ARRAY_SHEET_REFS and False:
-                    # TODO: IN PROGRESS. DISABLED FOR NOW.
-                    if isinstance(value, str) and value.lower().startswith("[ref:") and value.endswith("]"):
-                        if self._obtain_array_values:
-                            values = self._obtain_array_values(value)
-                        if sheet_name_containing_array := value[5:].strip():
-                            if dot := sheet_name_containing_array.find(".") > 0:
-                                if sheet_name_containing_array := sheet_name_containing_array[0:dot].strip():
-                                    pass
-                                    # sheet_column_containing_array = sheet_name_containing_array[dot + 1:].strip()
                 values = _split_array_string(value, unique=typeinfo.get("unique") if typeinfo else False)
                 if mapv:
                     values = [mapv(value, src) for value in values]
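Note (illustration, not part of the diff): wiring the new progress parameter up from the caller's side might look like the sketch below, assuming the factory whose signature changes above is the StructuredDataSet.load classmethod and reusing the hypothetical ParseProgressTracker sketched after progress_constants.py; the file name and portal value are placeholders.

    from dcicutils.structured_data import StructuredDataSet

    tracker = ParseProgressTracker()  # hypothetical consumer from the earlier sketch
    # Placeholder arguments; a real caller would supply its own portal, schemas, etc.
    structured_data = StructuredDataSet.load(file="submission.xlsx", portal=None, progress=tracker)
    print(f"rows parsed: {tracker.rows_processed}, load finished: {tracker.done}")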
{dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/RECORD
CHANGED
@@ -48,6 +48,7 @@ dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmj
 dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
 dcicutils/portal_object_utils.py,sha256=MF6MTZ6yxakZFDjbkTKCsF4q4p11dLDVvT5JBV9m6RQ,15408
 dcicutils/portal_utils.py,sha256=oBoI3KWRp6YrbsuVGbmPQ3kATB5cVVsQo7-qmnYXWqg,30260
+dcicutils/progress_constants.py,sha256=Q5ZzXYQXi6QMIYnUi_vxDAEH-nTYjQVauc9HPfvk5jE,1475
 dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
 dcicutils/qa_checkers.py,sha256=cdXjeL0jCDFDLT8VR8Px78aS10hwNISOO5G_Zv2TZ6M,20534
 dcicutils/qa_utils.py,sha256=TT0SiJWiuxYvbsIyhK9VO4uV_suxhB6CpuC4qPacCzQ,160208
@@ -62,15 +63,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
 dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
 dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
 dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
-dcicutils/structured_data.py,sha256=
+dcicutils/structured_data.py,sha256=1guVNDzIVxJkQA_m0jSh9xI2FB5oVXR4m7sqrqF8A5w,58559
 dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
 dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
 dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
 dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
 dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
 dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
-dcicutils-8.8.1.
-dcicutils-8.8.1.
-dcicutils-8.8.1.
-dcicutils-8.8.1.
-dcicutils-8.8.1.
+dcicutils-8.8.1.1b9.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
+dcicutils-8.8.1.1b9.dist-info/METADATA,sha256=PpSJ-JtZqnTWFk4eeZbU3RnCfRXko6sCYafK2wtmFW0,3356
+dcicutils-8.8.1.1b9.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+dcicutils-8.8.1.1b9.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
+dcicutils-8.8.1.1b9.dist-info/RECORD,,
{dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/LICENSE.txt: File without changes
{dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/WHEEL: File without changes
{dcicutils-8.8.1.1b6.dist-info → dcicutils-8.8.1.1b9.dist-info}/entry_points.txt: File without changes