dcicutils 8.8.1.1b7__py3-none-any.whl → 8.8.1.1b9__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- dcicutils/progress_constants.py +48 -0
- dcicutils/structured_data.py +30 -60
- {dcicutils-8.8.1.1b7.dist-info → dcicutils-8.8.1.1b9.dist-info}/METADATA +1 -1
- {dcicutils-8.8.1.1b7.dist-info → dcicutils-8.8.1.1b9.dist-info}/RECORD +7 -6
- {dcicutils-8.8.1.1b7.dist-info → dcicutils-8.8.1.1b9.dist-info}/LICENSE.txt +0 -0
- {dcicutils-8.8.1.1b7.dist-info → dcicutils-8.8.1.1b9.dist-info}/WHEEL +0 -0
- {dcicutils-8.8.1.1b7.dist-info → dcicutils-8.8.1.1b9.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
|
4
|
+
# Constants for progress tracking for smaht-submitr.
|
5
|
+
# Here only to share between smaht-portal, snovault, and smaht-submitr.
|
6
|
+
|
7
|
+
class PROGRESS_INGESTER(Enum):
    """Progress-tracking keys for the ingester stage.

    Both values carry the ``ingester_`` prefix. The members are plain
    string-valued enumerators, looked up by name or by value.
    """

    # Key for the ingester's validation step.
    VALIDATION = "ingester_validation"
    # Key for the ingester's initiation step.
    INITIATE = "ingester_initiate"
|
10
|
+
|
11
|
+
|
12
|
+
class PROGRESS_PARSE(Enum):
    """Keys of the progress dictionaries reported while loading and analyzing data.

    The ``LOAD_*`` members key the dictionaries passed to the load-time
    progress callback; the ``ANALYZE_*`` members key the dictionaries passed
    to the progress callable while diffs are computed.

    Every member must have a distinct value. ``Enum`` silently turns a
    repeated value into an alias of the first member that used it, which
    would make e.g. ``ANALYZE_START`` the very same object as ``LOAD_START``
    and leave consumers unable to tell the two events apart. For that reason
    the analyze start/done values are spelled differently from the load ones.
    """

    # --- Load (parse) phase ---
    LOAD_START = "start"
    LOAD_ITEM = "parse"
    LOAD_DONE = "finish"
    LOAD_COUNT_SHEETS = "sheets"
    LOAD_COUNT_ROWS = "rows"
    LOAD_COUNT_REFS = "refs"
    LOAD_COUNT_REFS_FOUND = "refs_found"
    LOAD_COUNT_REFS_NOT_FOUND = "refs_not_found"
    LOAD_COUNT_REFS_LOOKUP = "refs_lookup"
    LOAD_COUNT_REFS_LOOKUP_CACHE_HIT = "refs_lookup_cache_hit"
    LOAD_COUNT_REFS_EXISTS_CACHE_HIT = "refs_exists_cache_hit"
    LOAD_COUNT_REFS_INVALID = "refs_invalid"

    # --- Analyze phase ---
    # Must not reuse "start" (LOAD_START); a repeated value would become an alias.
    ANALYZE_START = "analyze_start"
    ANALYZE_COUNT_TYPES = "types"
    ANALYZE_COUNT_ITEMS = "objects"
    ANALYZE_CREATE = "create"
    ANALYZE_COUNT_LOOKUP = "lookups"
    ANALYZE_UPDATE = "update"
    # Must not reuse "finish" (LOAD_DONE); a repeated value would become an alias.
    ANALYZE_DONE = "analyze_finish"
|
32
|
+
|
33
|
+
|
34
|
+
class PROGRESS_LOADXL(Enum):
    """Progress-tracking keys for the loadxl stage.

    Every value carries the ``loadxl_`` prefix. Note that ``GET`` maps to
    ``"loadxl_lookup"`` rather than ``"loadxl_get"``.
    """

    # Lifecycle markers.
    INITIATE = "loadxl_initiate"
    START = "loadxl_start"
    START_SECOND_ROUND = "loadxl_start_second_round"

    # Per-item markers.
    ITEM = "loadxl_item"
    ITEM_SECOND_ROUND = "loadxl_item_second_round"

    # Per-operation markers.
    GET = "loadxl_lookup"
    POST = "loadxl_post"
    PATCH = "loadxl_patch"
    ERROR = "loadxl_error"

    # Completion and totals.
    DONE = "loadxl_done"
    TOTAL = "loadxl_total"

    # Message channels.
    MESSAGE = "loadxl_message"
    MESSAGE_VERBOSE = "loadxl_message_verbose"
    MESSAGE_DEBUG = "loadxl_message_debug"
|
dcicutils/structured_data.py
CHANGED
@@ -17,6 +17,7 @@ from dcicutils.misc_utils import (create_dict, create_readonly_object, is_uuid,
|
|
17
17
|
to_boolean, to_enum, to_float, to_integer, VirtualApp)
|
18
18
|
from dcicutils.portal_object_utils import PortalObject
|
19
19
|
from dcicutils.portal_utils import Portal as PortalBase
|
20
|
+
from dcicutils.progress_constants import PROGRESS_PARSE as PROGRESS
|
20
21
|
from dcicutils.schema_utils import Schema as SchemaBase
|
21
22
|
from dcicutils.zip_utils import unpack_gz_file_to_temporary_file, unpack_files
|
22
23
|
|
@@ -37,31 +38,10 @@ ARRAY_NAME_SUFFIX_CHAR = "#"
|
|
37
38
|
ARRAY_NAME_SUFFIX_REGEX = re.compile(rf"{ARRAY_NAME_SUFFIX_CHAR}\d+")
|
38
39
|
DOTTED_NAME_DELIMITER_CHAR = "."
|
39
40
|
|
41
|
+
|
40
42
|
# TODO: Should probably pass this knowledge in from callers.
|
41
43
|
FILE_TYPE_NAME = "File"
|
42
44
|
FILE_TYPE_PROPERTY_NAME = "filename"
|
43
|
-
# This ExtraFile is a pseudo-type to handle extra_files in smaht-submitr.
|
44
|
-
EXTRA_FILE_TYPE_NAME = "ExtraFile"
|
45
|
-
EXTRA_FILE_TYPE_PROPERTY_NAME = "extra_files"
|
46
|
-
|
47
|
-
ENABLE_ARRAY_SHEET_REFS = False
|
48
|
-
|
49
|
-
# The ExtraFile pseudo-type schema.
|
50
|
-
EXTRA_FILE_SCHEMA = {
|
51
|
-
"title": "ExtraFile",
|
52
|
-
"type": "object",
|
53
|
-
"required": [
|
54
|
-
"filename"
|
55
|
-
],
|
56
|
-
"identifyingProperties": [
|
57
|
-
"filename"
|
58
|
-
],
|
59
|
-
"properties": {
|
60
|
-
"filename": {
|
61
|
-
"type": "string"
|
62
|
-
}
|
63
|
-
}
|
64
|
-
}
|
65
45
|
|
66
46
|
# Forward type references for type hints.
|
67
47
|
Portal = Type["Portal"]
|
@@ -212,7 +192,8 @@ class StructuredDataSet:
|
|
212
192
|
diffs = {}
|
213
193
|
if callable(progress):
|
214
194
|
ntypes, nobjects = get_counts()
|
215
|
-
progress({
|
195
|
+
progress({PROGRESS.ANALYZE_START: True,
|
196
|
+
PROGRESS.ANALYZE_COUNT_TYPES: ntypes, PROGRESS.ANALYZE_COUNT_ITEMS: nobjects})
|
216
197
|
if self.data or self.portal: # TODO: what is this OR biz?
|
217
198
|
refs = self.resolved_refs_with_uuids
|
218
199
|
# TODO: Need feedback/progress tracking mechanism here.
|
@@ -231,18 +212,19 @@ class StructuredDataSet:
|
|
231
212
|
uuid=existing_object.uuid,
|
232
213
|
diffs=object_diffs or None))
|
233
214
|
if callable(progress):
|
234
|
-
progress({
|
215
|
+
progress({PROGRESS.ANALYZE_UPDATE: True,
|
216
|
+
PROGRESS.ANALYZE_COUNT_LOOKUP: nlookups + nlookups_compare})
|
235
217
|
elif identifying_path:
|
236
218
|
# If there is no existing object we still create a record for this object
|
237
219
|
# but with no uuid which will be the indication that it does not exist.
|
238
220
|
diffs[type_name].append(create_readonly_object(path=identifying_path, uuid=None, diffs=None))
|
239
221
|
if callable(progress):
|
240
|
-
progress({
|
222
|
+
progress({PROGRESS.ANALYZE_CREATE: True, PROGRESS.ANALYZE_COUNT_LOOKUP: nlookups})
|
241
223
|
else:
|
242
224
|
if callable(progress):
|
243
|
-
progress({
|
225
|
+
progress({PROGRESS.ANALYZE_COUNT_LOOKUP: nlookups})
|
244
226
|
if callable(progress):
|
245
|
-
progress({
|
227
|
+
progress({PROGRESS.ANALYZE_DONE: True})
|
246
228
|
return diffs
|
247
229
|
|
248
230
|
def load_file(self, file: str) -> None:
|
@@ -287,9 +269,10 @@ class StructuredDataSet:
|
|
287
269
|
for row in excel.sheet_reader(sheet_name):
|
288
270
|
nrows += 1
|
289
271
|
return nrows, len(excel.sheet_names)
|
290
|
-
if self._progress:
|
272
|
+
if self._progress: # TODO: Move to _load_reader
|
291
273
|
nrows, nsheets = get_counts()
|
292
|
-
self._progress({
|
274
|
+
self._progress({PROGRESS.LOAD_START: True,
|
275
|
+
PROGRESS.LOAD_COUNT_SHEETS: nsheets, PROGRESS.LOAD_COUNT_ROWS: nrows})
|
293
276
|
excel = Excel(file) # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl).
|
294
277
|
order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {}
|
295
278
|
for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)):
|
@@ -313,16 +296,15 @@ class StructuredDataSet:
|
|
313
296
|
else:
|
314
297
|
del self._errors["ref"]
|
315
298
|
if self._progress:
|
316
|
-
# TODO: Refactor with same thing below in _load_reader.
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
"refs_invalid": self.ref_invalid_identifying_property_count
|
299
|
+
self._progress({ # TODO: Refactor with same thing below in _load_reader.
|
300
|
+
PROGRESS.LOAD_DONE: True,
|
301
|
+
PROGRESS.LOAD_COUNT_REFS: self.ref_total_count,
|
302
|
+
PROGRESS.LOAD_COUNT_REFS_FOUND: self.ref_total_found_count,
|
303
|
+
PROGRESS.LOAD_COUNT_REFS_NOT_FOUND: self.ref_total_notfound_count,
|
304
|
+
PROGRESS.LOAD_COUNT_REFS_LOOKUP: self.ref_lookup_count,
|
305
|
+
PROGRESS.LOAD_COUNT_REFS_LOOKUP_CACHE_HIT: self.ref_lookup_cache_hit_count,
|
306
|
+
PROGRESS.LOAD_COUNT_REFS_EXISTS_CACHE_HIT: self.ref_exists_cache_hit_count,
|
307
|
+
PROGRESS.LOAD_COUNT_REFS_INVALID: self.ref_invalid_identifying_property_count
|
326
308
|
})
|
327
309
|
|
328
310
|
def _load_json_file(self, file: str) -> None:
|
@@ -352,14 +334,14 @@ class StructuredDataSet:
|
|
352
334
|
self._add(type_name, structured_row)
|
353
335
|
if self._progress:
|
354
336
|
self._progress({
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
337
|
+
PROGRESS.LOAD_ITEM: True,
|
338
|
+
PROGRESS.LOAD_COUNT_REFS: self.ref_total_count,
|
339
|
+
PROGRESS.LOAD_COUNT_REFS_FOUND: self.ref_total_found_count,
|
340
|
+
PROGRESS.LOAD_COUNT_REFS_NOT_FOUND: self.ref_total_notfound_count,
|
341
|
+
PROGRESS.LOAD_COUNT_REFS_LOOKUP: self.ref_lookup_count,
|
342
|
+
PROGRESS.LOAD_COUNT_REFS_LOOKUP_CACHE_HIT: self.ref_lookup_cache_hit_count,
|
343
|
+
PROGRESS.LOAD_COUNT_REFS_EXISTS_CACHE_HIT: self.ref_exists_cache_hit_count,
|
344
|
+
PROGRESS.LOAD_COUNT_REFS_INVALID: self.ref_invalid_identifying_property_count
|
363
345
|
})
|
364
346
|
self._note_warning(reader.warnings, "reader")
|
365
347
|
if schema:
|
@@ -465,12 +447,10 @@ class StructuredDataSet:
|
|
465
447
|
|
466
448
|
class _StructuredRowTemplate:
|
467
449
|
|
468
|
-
def __init__(self, column_names: List[str], schema: Optional[Schema] = None
|
469
|
-
obtain_array_values: Optional[Callable] = None) -> None:
|
450
|
+
def __init__(self, column_names: List[str], schema: Optional[Schema] = None) -> None:
|
470
451
|
self._schema = schema
|
471
452
|
self._set_value_functions = {}
|
472
453
|
self._template = self._create_row_template(column_names)
|
473
|
-
self._obtain_array_values = obtain_array_values if callable(obtain_array_values) else None
|
474
454
|
|
475
455
|
def create_row(self) -> dict:
|
476
456
|
return copy.deepcopy(self._template)
|
@@ -530,16 +510,6 @@ class _StructuredRowTemplate:
|
|
530
510
|
set_value_backtrack_object(i, p)
|
531
511
|
data = data[p]
|
532
512
|
if (p := path[-1]) == -1 and isinstance(value, str):
|
533
|
-
if ENABLE_ARRAY_SHEET_REFS and False:
|
534
|
-
# TODO: IN PROGRESS. DISABLED FOR NOW.
|
535
|
-
if isinstance(value, str) and value.lower().startswith("[ref:") and value.endswith("]"):
|
536
|
-
if self._obtain_array_values:
|
537
|
-
values = self._obtain_array_values(value)
|
538
|
-
if sheet_name_containing_array := value[5:].strip():
|
539
|
-
if dot := sheet_name_containing_array.find(".") > 0:
|
540
|
-
if sheet_name_containing_array := sheet_name_containing_array[0:dot].strip():
|
541
|
-
pass
|
542
|
-
# sheet_column_containing_array = sheet_name_containing_array[dot + 1:].strip()
|
543
513
|
values = _split_array_string(value, unique=typeinfo.get("unique") if typeinfo else False)
|
544
514
|
if mapv:
|
545
515
|
values = [mapv(value, src) for value in values]
|
@@ -48,6 +48,7 @@ dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmj
|
|
48
48
|
dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
|
49
49
|
dcicutils/portal_object_utils.py,sha256=MF6MTZ6yxakZFDjbkTKCsF4q4p11dLDVvT5JBV9m6RQ,15408
|
50
50
|
dcicutils/portal_utils.py,sha256=oBoI3KWRp6YrbsuVGbmPQ3kATB5cVVsQo7-qmnYXWqg,30260
|
51
|
+
dcicutils/progress_constants.py,sha256=Q5ZzXYQXi6QMIYnUi_vxDAEH-nTYjQVauc9HPfvk5jE,1475
|
51
52
|
dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
|
52
53
|
dcicutils/qa_checkers.py,sha256=cdXjeL0jCDFDLT8VR8Px78aS10hwNISOO5G_Zv2TZ6M,20534
|
53
54
|
dcicutils/qa_utils.py,sha256=TT0SiJWiuxYvbsIyhK9VO4uV_suxhB6CpuC4qPacCzQ,160208
|
@@ -62,15 +63,15 @@ dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19
|
|
62
63
|
dcicutils/sheet_utils.py,sha256=VlmzteONW5VF_Q4vo0yA5vesz1ViUah1MZ_yA1rwZ0M,33629
|
63
64
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
64
65
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
65
|
-
dcicutils/structured_data.py,sha256=
|
66
|
+
dcicutils/structured_data.py,sha256=1guVNDzIVxJkQA_m0jSh9xI2FB5oVXR4m7sqrqF8A5w,58559
|
66
67
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
67
68
|
dcicutils/tmpfile_utils.py,sha256=n95XF8dZVbQRSXBZTGToXXfSs3JUVRyN6c3ZZ0nhAWI,1403
|
68
69
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
69
70
|
dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
|
70
71
|
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
71
72
|
dcicutils/zip_utils.py,sha256=rnjNv_k6L9jT2SjDSgVXp4BEJYLtz9XN6Cl2Fy-tqnM,2027
|
72
|
-
dcicutils-8.8.1.
|
73
|
-
dcicutils-8.8.1.
|
74
|
-
dcicutils-8.8.1.
|
75
|
-
dcicutils-8.8.1.
|
76
|
-
dcicutils-8.8.1.
|
73
|
+
dcicutils-8.8.1.1b9.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
|
74
|
+
dcicutils-8.8.1.1b9.dist-info/METADATA,sha256=PpSJ-JtZqnTWFk4eeZbU3RnCfRXko6sCYafK2wtmFW0,3356
|
75
|
+
dcicutils-8.8.1.1b9.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
76
|
+
dcicutils-8.8.1.1b9.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
|
77
|
+
dcicutils-8.8.1.1b9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|