metameq 2026.1.1__tar.gz → 2026.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {metameq-2026.1.1/metameq.egg-info → metameq-2026.2.1}/PKG-INFO +1 -1
- {metameq-2026.1.1 → metameq-2026.2.1}/environment.yml +1 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/_version.py +3 -3
- metameq-2026.2.1/metameq/config/config.yml +7 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_configurator.py +146 -1
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_extender.py +92 -42
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/util.py +2 -0
- metameq-2026.2.1/metameq/tests/test_metadata_configurator.py +4870 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_extender.py +1801 -126
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_merger.py +1 -1
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_util.py +1 -1
- {metameq-2026.1.1 → metameq-2026.2.1/metameq.egg-info}/PKG-INFO +1 -1
- metameq-2026.1.1/metameq/config/config.yml +0 -3
- metameq-2026.1.1/metameq/tests/test_metadata_configurator.py +0 -2334
- {metameq-2026.1.1 → metameq-2026.2.1}/.gitattributes +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/.github/workflows/main.yaml +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/.gitignore +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/README.md +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq.png +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq_dark.svg +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq_light.svg +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq_medium.png +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/__init__.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/config/__init__.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/config/standards.yml +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/__init__.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/__main__.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_merger.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_transformers.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_validator.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/__init__.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/data/invalid.yml +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/data/test_config.yml +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_transformers.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_validator.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/SOURCES.txt +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/dependency_links.txt +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/entry_points.txt +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/requires.txt +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/top_level.txt +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/setup.cfg +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/setup.py +0 -0
- {metameq-2026.1.1 → metameq-2026.2.1}/versioneer.py +0 -0
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-
|
|
11
|
+
"date": "2026-02-01T21:32:06-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "2026.
|
|
14
|
+
"full-revisionid": "87171fd73f555e2c03a15fa36ed9b5a912b824e9",
|
|
15
|
+
"version": "2026.02.1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -4,7 +4,8 @@ from metameq.src.util import extract_config_dict, extract_stds_config, \
|
|
|
4
4
|
METADATA_FIELDS_KEY, STUDY_SPECIFIC_METADATA_KEY, \
|
|
5
5
|
HOST_TYPE_SPECIFIC_METADATA_KEY, \
|
|
6
6
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
|
|
7
|
-
DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY
|
|
7
|
+
DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY, \
|
|
8
|
+
SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def combine_stds_and_study_config(
|
|
@@ -89,6 +90,15 @@ def flatten_nested_stds_dict(
|
|
|
89
90
|
if curr_host_type_sub_host_dict:
|
|
90
91
|
wip_host_types_dict.update(curr_host_type_sub_host_dict)
|
|
91
92
|
|
|
93
|
+
# resolve aliases and base types for this host's sample types
|
|
94
|
+
# This happens AFTER recursion so children inherit unresolved aliases,
|
|
95
|
+
# ensuring correct bottom-up resolution order
|
|
96
|
+
if SAMPLE_TYPE_SPECIFIC_METADATA_KEY in curr_host_type_wip_flat_dict:
|
|
97
|
+
curr_host_type_wip_flat_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY] = \
|
|
98
|
+
_resolve_sample_type_aliases_and_bases(
|
|
99
|
+
curr_host_type_wip_flat_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY],
|
|
100
|
+
curr_host_type_wip_flat_dict.get(METADATA_FIELDS_KEY, {}))
|
|
101
|
+
|
|
92
102
|
# assign the flattened wip dict for the current host type to the result
|
|
93
103
|
# (which now contains flat records for the hosts lower down than
|
|
94
104
|
# this, if there are any)
|
|
@@ -270,8 +280,11 @@ def _combine_base_and_added_host_type(
|
|
|
270
280
|
_combine_base_and_added_sample_type_specific_metadata(
|
|
271
281
|
host_type_wip_nested_dict,
|
|
272
282
|
host_type_add_dict)
|
|
283
|
+
|
|
273
284
|
# if we got back a non-empty dictionary of sample types,
|
|
274
285
|
# add it to the wip for this host type dict
|
|
286
|
+
# Note: resolution of aliases/base types happens in flatten_nested_stds_dict
|
|
287
|
+
# AFTER recursion, to ensure correct bottom-up resolution order
|
|
275
288
|
if curr_host_wip_sample_types_dict:
|
|
276
289
|
host_type_wip_nested_dict[
|
|
277
290
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY] = \
|
|
@@ -450,6 +463,130 @@ def _id_sample_type_definition(sample_type_name: str, sample_type_dict: Dict[str
|
|
|
450
463
|
"the same sample type dict")
|
|
451
464
|
|
|
452
465
|
|
|
466
|
+
def _construct_sample_type_metadata_fields_dict(
        sample_type: str,
        host_sample_types_config_dict: Dict[str, Any],
        a_host_type_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Construct metadata fields dictionary for a specific host+sample type, resolving aliases and base types.

    The result is built in layers: a copy of the host-level metadata fields,
    then the (alias- and base-type-resolved) sample-type metadata fields on
    top, then a fixed ``sample_type`` field definition and, unless one is
    already present, a matching ``qiita_sample_type`` field definition.

    Parameters
    ----------
    sample_type : str
        The sample type to process. Must be a key of
        host_sample_types_config_dict.
    host_sample_types_config_dict : Dict[str, Any]
        Dictionary containing config for *all* sample types in
        the host type in question.
    a_host_type_metadata_fields_dict : Dict[str, Any]
        Dictionary containing metadata fields for the host type in question.
        A copy is taken before merging, so this input is not handed to the
        merge helper directly.

    Returns
    -------
    Dict[str, Any]
        The constructed metadata fields dictionary for this host-and-sample-type combination.

    Raises
    ------
    ValueError
        If the alias target has no metadata fields entry (reported as a
        chained alias) or the base type holds anything other than
        metadata fields.
    KeyError
        If sample_type itself, its alias target, or its base type is not
        a key of host_sample_types_config_dict.
    """
    sample_type_for_metadata = sample_type

    # get dict associated with the naive sample type
    sample_type_specific_dict = host_sample_types_config_dict[sample_type]

    # if naive sample type contains an alias
    sample_type_alias = sample_type_specific_dict.get(ALIAS_KEY)
    if sample_type_alias:
        # fail with context rather than a bare KeyError('<name>') when the
        # alias points at a sample type this host does not define
        if sample_type_alias not in host_sample_types_config_dict:
            raise KeyError(
                f"Sample type '{sample_type}' is an alias of undefined "
                f"sample type '{sample_type_alias}'")

        # change the sample type to the alias sample type
        # and use the alias's sample type dict
        sample_type_for_metadata = sample_type_alias
        sample_type_specific_dict = \
            host_sample_types_config_dict[sample_type_alias]
        if METADATA_FIELDS_KEY not in sample_type_specific_dict:
            raise ValueError(f"May not chain aliases "
                             f"('{sample_type}' to '{sample_type_alias}')")
    # endif sample type is an alias

    # if the sample type has a base type
    sample_type_base = sample_type_specific_dict.get(BASE_TYPE_KEY)
    if sample_type_base:
        # same contextual failure for a base type that is not defined
        if sample_type_base not in host_sample_types_config_dict:
            raise KeyError(
                f"Sample type '{sample_type_for_metadata}' has undefined "
                f"base sample type '{sample_type_base}'")

        # get the base's sample type dict and add this sample type's
        # info on top of it
        base_sample_dict = host_sample_types_config_dict[sample_type_base]
        if list(base_sample_dict.keys()) != [METADATA_FIELDS_KEY]:
            raise ValueError(f"Base sample type '{sample_type_base}' "
                             "must only have metadata fields")
        sample_type_specific_dict_metadata = update_wip_metadata_dict(
            deepcopy_dict(base_sample_dict[METADATA_FIELDS_KEY]),
            sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))
        # work on a copy so the caller's config entry is not rebound to the
        # merged metadata
        sample_type_specific_dict = deepcopy_dict(sample_type_specific_dict)
        sample_type_specific_dict[METADATA_FIELDS_KEY] = \
            sample_type_specific_dict_metadata
    # endif sample type has a base type

    # add the sample-type-specific info generated above on top of the host info
    sample_type_metadata_dict = update_wip_metadata_dict(
        deepcopy_dict(a_host_type_metadata_fields_dict),
        sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))

    # set sample_type, and qiita_sample_type if it is not already set;
    # note that for an alias the value is the alias *target's* name
    sample_type_definition = {
        ALLOWED_KEY: [sample_type_for_metadata],
        DEFAULT_KEY: sample_type_for_metadata,
        TYPE_KEY: "string"
    }
    sample_type_metadata_dict = update_wip_metadata_dict(
        sample_type_metadata_dict, {SAMPLE_TYPE_KEY: sample_type_definition})
    if QIITA_SAMPLE_TYPE not in sample_type_metadata_dict:
        sample_type_metadata_dict = update_wip_metadata_dict(
            sample_type_metadata_dict, {QIITA_SAMPLE_TYPE: sample_type_definition})
    # end if qiita_sample_type not already set

    return sample_type_metadata_dict
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def _resolve_sample_type_aliases_and_bases(
        sample_types_dict: Dict[str, Any],
        host_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Build a fully resolved copy of a host's sample type definitions.

    Every sample type named in the input is passed through
    _construct_sample_type_metadata_fields_dict, and the metadata fields
    it returns become the sole content of that sample type's entry in
    the output.

    Parameters
    ----------
    sample_types_dict : Dict[str, Any]
        Dictionary of sample type configurations (from sample_type_specific_metadata).
    host_metadata_fields_dict : Dict[str, Any]
        Host-level metadata fields to merge into each sample type.

    Returns
    -------
    Dict[str, Any]
        New dictionary keyed by the same sample type names, in the same
        order, where each value holds only a metadata fields entry.

    Raises
    ------
    ValueError
        Propagated from _construct_sample_type_metadata_fields_dict when
        it rejects an alias or base type configuration.
    """
    # One entry per input sample type; the full input dict is handed to the
    # per-type resolver so it can look up alias and base targets by name.
    return {
        curr_sample_type: {
            METADATA_FIELDS_KEY: _construct_sample_type_metadata_fields_dict(
                curr_sample_type, sample_types_dict,
                host_metadata_fields_dict)
        }
        for curr_sample_type in sample_types_dict
    }
|
|
588
|
+
|
|
589
|
+
|
|
453
590
|
def build_full_flat_config_dict(
|
|
454
591
|
study_specific_config_dict: Optional[Dict[str, Any]] = None,
|
|
455
592
|
software_config_dict: Optional[Dict[str, Any]] = None,
|
|
@@ -503,6 +640,14 @@ def build_full_flat_config_dict(
|
|
|
503
640
|
full_nested_hosts_dict, None)
|
|
504
641
|
software_plus_study_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY] = \
|
|
505
642
|
full_flat_hosts_dict
|
|
643
|
+
|
|
644
|
+
# drop the STUDY_SPECIFIC_METADATA_KEY from the final output dict (because
|
|
645
|
+
# its contents have already been incorporated into the
|
|
646
|
+
# HOST_TYPE_SPECIFIC_METADATA_KEY section); note we keep all the other
|
|
647
|
+
# top-level keys from the study-specific config dict
|
|
648
|
+
if STUDY_SPECIFIC_METADATA_KEY in software_plus_study_flat_config_dict:
|
|
649
|
+
del software_plus_study_flat_config_dict[STUDY_SPECIFIC_METADATA_KEY]
|
|
650
|
+
|
|
506
651
|
# this is just a renaming to indicate that, having overwritten any original
|
|
507
652
|
# HOST_TYPE_SPECIFIC_METADATA_KEY in the software_plus_study_flat_config_dict
|
|
508
653
|
# with the complete and flattened combination of software+study+standards, it is now
|
|
@@ -15,7 +15,8 @@ from metameq.src.util import extract_config_dict, \
|
|
|
15
15
|
LEAVE_BLANK_VAL, SAMPLE_NAME_KEY, \
|
|
16
16
|
ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY, OVERWRITE_NON_NANS_KEY, \
|
|
17
17
|
METADATA_TRANSFORMERS_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
|
|
18
|
-
SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS
|
|
18
|
+
SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS, \
|
|
19
|
+
HOSTTYPE_COL_OPTIONS_KEY, SAMPLETYPE_COL_OPTIONS_KEY
|
|
19
20
|
from metameq.src.metadata_configurator import update_wip_metadata_dict, \
|
|
20
21
|
build_full_flat_config_dict
|
|
21
22
|
from metameq.src.metadata_validator import validate_metadata_df, \
|
|
@@ -42,7 +43,7 @@ pandas.set_option("future.no_silent_downcasting", True)
|
|
|
42
43
|
def get_reserved_cols(
|
|
43
44
|
raw_metadata_df: pandas.DataFrame,
|
|
44
45
|
study_specific_config_dict: Dict[str, Any],
|
|
45
|
-
|
|
46
|
+
stds_fp: Optional[str] = None) -> List[str]:
|
|
46
47
|
"""Get a list of all reserved column names for all host+sample type combinations in the metadata.
|
|
47
48
|
|
|
48
49
|
Note that 'reserved' is not the same as 'required'. Some column names (e.g.,
|
|
@@ -55,8 +56,9 @@ def get_reserved_cols(
|
|
|
55
56
|
The input metadata DataFrame.
|
|
56
57
|
study_specific_config_dict : Dict[str, Any]
|
|
57
58
|
Study-specific flat-host-type config dictionary.
|
|
58
|
-
|
|
59
|
-
|
|
59
|
+
stds_fp : Optional[str], default=None
|
|
60
|
+
Path to standards dictionary file. If None, the default standards
|
|
61
|
+
config pulled from the standards.yml file will be used.
|
|
60
62
|
|
|
61
63
|
Returns
|
|
62
64
|
-------
|
|
@@ -90,10 +92,10 @@ def get_reserved_cols(
|
|
|
90
92
|
|
|
91
93
|
# extend the metadata_df to get all the required columns for all host+sample type combinations;
|
|
92
94
|
# we don't really care about the contents of these columns, just their names.
|
|
93
|
-
# (
|
|
95
|
+
# (Likewise, it is not necessary to pass the actual study_specific_transformers_dict so
|
|
96
|
+
# just use None)
|
|
94
97
|
metadata_df, _ = extend_metadata_df(
|
|
95
|
-
temp_df, study_specific_config_dict,
|
|
96
|
-
study_specific_transformers_dict)
|
|
98
|
+
temp_df, study_specific_config_dict, None, None, stds_fp)
|
|
97
99
|
|
|
98
100
|
return sorted(metadata_df.columns.to_list())
|
|
99
101
|
|
|
@@ -119,7 +121,7 @@ def id_missing_cols(a_df: pandas.DataFrame) -> List[str]:
|
|
|
119
121
|
def find_standard_cols(
|
|
120
122
|
a_df: pandas.DataFrame,
|
|
121
123
|
study_specific_config_dict: Dict[str, Any],
|
|
122
|
-
|
|
124
|
+
stds_fp: Optional[str] = None,
|
|
123
125
|
suppress_missing_name_err: bool = False) -> List[str]:
|
|
124
126
|
"""Find all the standard columns in the metadata DataFrame.
|
|
125
127
|
|
|
@@ -129,8 +131,9 @@ def find_standard_cols(
|
|
|
129
131
|
The metadata DataFrame to analyze.
|
|
130
132
|
study_specific_config_dict : Dict[str, Any]
|
|
131
133
|
Study-specific flat-host-type config dictionary.
|
|
132
|
-
|
|
133
|
-
|
|
134
|
+
stds_fp : Optional[str], default=None
|
|
135
|
+
Path to standards dictionary file. If None, the default standards
|
|
136
|
+
config pulled from the standards.yml file will be used.
|
|
134
137
|
suppress_missing_name_err : bool, default=False
|
|
135
138
|
Whether to suppress errors about missing sample name.
|
|
136
139
|
|
|
@@ -156,8 +159,7 @@ def find_standard_cols(
|
|
|
156
159
|
# get the intersection of the reserved standard columns and
|
|
157
160
|
# the columns in the input dataframe
|
|
158
161
|
standard_cols = get_reserved_cols(
|
|
159
|
-
a_df, study_specific_config_dict,
|
|
160
|
-
study_specific_transformers_dict=study_specific_transformers_dict)
|
|
162
|
+
a_df, study_specific_config_dict, stds_fp)
|
|
161
163
|
|
|
162
164
|
standard_cols_set = (set(standard_cols) - set(INTERNAL_COL_KEYS))
|
|
163
165
|
|
|
@@ -167,7 +169,7 @@ def find_standard_cols(
|
|
|
167
169
|
def find_nonstandard_cols(
|
|
168
170
|
a_df: pandas.DataFrame,
|
|
169
171
|
study_specific_config_dict: Dict[str, Any],
|
|
170
|
-
|
|
172
|
+
stds_fp: Optional[str] = None) -> List[str]:
|
|
171
173
|
"""Find any non-standard columns in the metadata DataFrame.
|
|
172
174
|
|
|
173
175
|
Parameters
|
|
@@ -176,8 +178,9 @@ def find_nonstandard_cols(
|
|
|
176
178
|
The metadata DataFrame to analyze.
|
|
177
179
|
study_specific_config_dict : Dict[str, Any]
|
|
178
180
|
Study-specific flat-host-type config dictionary.
|
|
179
|
-
|
|
180
|
-
|
|
181
|
+
stds_fp : Optional[str], default=None
|
|
182
|
+
Path to standards dictionary file. If None, the default standards
|
|
183
|
+
config pulled from the standards.yml file will be used.
|
|
181
184
|
|
|
182
185
|
Returns
|
|
183
186
|
-------
|
|
@@ -195,15 +198,15 @@ def find_nonstandard_cols(
|
|
|
195
198
|
|
|
196
199
|
# get the columns in
|
|
197
200
|
standard_cols = get_reserved_cols(
|
|
198
|
-
a_df, study_specific_config_dict,
|
|
199
|
-
study_specific_transformers_dict=study_specific_transformers_dict)
|
|
201
|
+
a_df, study_specific_config_dict, stds_fp)
|
|
200
202
|
|
|
201
203
|
return list(set(a_df.columns) - set(standard_cols))
|
|
202
204
|
|
|
203
205
|
|
|
204
206
|
def get_extended_metadata_from_df_and_yaml(
|
|
205
207
|
raw_metadata_df: pandas.DataFrame,
|
|
206
|
-
study_specific_config_fp: Optional[str]
|
|
208
|
+
study_specific_config_fp: Optional[str],
|
|
209
|
+
stds_fp: Optional[str] = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
|
|
207
210
|
"""Extend metadata using configuration from a study-specific YAML config file.
|
|
208
211
|
|
|
209
212
|
Parameters
|
|
@@ -212,6 +215,9 @@ def get_extended_metadata_from_df_and_yaml(
|
|
|
212
215
|
The raw metadata DataFrame to extend.
|
|
213
216
|
study_specific_config_fp : Optional[str]
|
|
214
217
|
Path to the study-specific configuration YAML file.
|
|
218
|
+
stds_fp : Optional[str], default=None
|
|
219
|
+
Path to standards dictionary file. If None, the default standards
|
|
220
|
+
config pulled from the standards.yml file will be used.
|
|
215
221
|
|
|
216
222
|
Returns
|
|
217
223
|
-------
|
|
@@ -226,7 +232,8 @@ def get_extended_metadata_from_df_and_yaml(
|
|
|
226
232
|
|
|
227
233
|
# extend the metadata DataFrame using the study-specific flat-host-type config dictionary
|
|
228
234
|
metadata_df, validation_msgs_df = \
|
|
229
|
-
extend_metadata_df(raw_metadata_df, study_specific_config_dict
|
|
235
|
+
extend_metadata_df(raw_metadata_df, study_specific_config_dict,
|
|
236
|
+
None, None, stds_fp)
|
|
230
237
|
|
|
231
238
|
return metadata_df, validation_msgs_df
|
|
232
239
|
|
|
@@ -257,7 +264,8 @@ def write_extended_metadata(
|
|
|
257
264
|
out_name_base: str,
|
|
258
265
|
sep: str = "\t",
|
|
259
266
|
remove_internals: bool = True,
|
|
260
|
-
suppress_empty_fails: bool = False
|
|
267
|
+
suppress_empty_fails: bool = False,
|
|
268
|
+
stds_fp: Optional[str] = None) -> pandas.DataFrame:
|
|
261
269
|
"""Write extended metadata to files starting from input file paths to metadata and config.
|
|
262
270
|
|
|
263
271
|
Parameters
|
|
@@ -276,6 +284,9 @@ def write_extended_metadata(
|
|
|
276
284
|
Whether to remove internal columns.
|
|
277
285
|
suppress_empty_fails : bool, default=False
|
|
278
286
|
Whether to suppress empty failure files.
|
|
287
|
+
stds_fp : Optional[str], default=None
|
|
288
|
+
Path to standards dictionary file. If None, the default standards
|
|
289
|
+
config pulled from the standards.yml file will be used.
|
|
279
290
|
|
|
280
291
|
Returns
|
|
281
292
|
-------
|
|
@@ -310,7 +321,8 @@ def write_extended_metadata(
|
|
|
310
321
|
raw_metadata_df, study_specific_config_dict,
|
|
311
322
|
out_dir, out_name_base, sep=sep,
|
|
312
323
|
remove_internals=remove_internals,
|
|
313
|
-
suppress_empty_fails=suppress_empty_fails
|
|
324
|
+
suppress_empty_fails=suppress_empty_fails,
|
|
325
|
+
stds_fp=stds_fp)
|
|
314
326
|
|
|
315
327
|
# for good measure, return the extended metadata DataFrame
|
|
316
328
|
return extended_df
|
|
@@ -351,7 +363,8 @@ def write_extended_metadata_from_df(
|
|
|
351
363
|
sep: str = "\t",
|
|
352
364
|
remove_internals: bool = True,
|
|
353
365
|
suppress_empty_fails: bool = False,
|
|
354
|
-
internal_col_names: Optional[List[str]] = None
|
|
366
|
+
internal_col_names: Optional[List[str]] = None,
|
|
367
|
+
stds_fp: Optional[str] = None) -> pandas.DataFrame:
|
|
355
368
|
"""Write extended metadata to files starting from a metadata DataFrame and config dictionary.
|
|
356
369
|
|
|
357
370
|
Parameters
|
|
@@ -374,6 +387,9 @@ def write_extended_metadata_from_df(
|
|
|
374
387
|
Whether to suppress empty failure files.
|
|
375
388
|
internal_col_names : Optional[List[str]], default=None
|
|
376
389
|
List of internal column names.
|
|
390
|
+
stds_fp : Optional[str], default=None
|
|
391
|
+
Path to standards dictionary file. If None, the default standards
|
|
392
|
+
config pulled from the standards.yml file will be used.
|
|
377
393
|
|
|
378
394
|
Returns
|
|
379
395
|
-------
|
|
@@ -383,7 +399,7 @@ def write_extended_metadata_from_df(
|
|
|
383
399
|
# extend the metadata DataFrame using the study-specific flat-host-type config dictionary
|
|
384
400
|
metadata_df, validation_msgs_df = extend_metadata_df(
|
|
385
401
|
raw_metadata_df, study_specific_config_dict,
|
|
386
|
-
study_specific_transformers_dict)
|
|
402
|
+
study_specific_transformers_dict, None, stds_fp)
|
|
387
403
|
|
|
388
404
|
# write the metadata and validation results to files
|
|
389
405
|
write_metadata_results(
|
|
@@ -432,13 +448,22 @@ def extend_metadata_df(
|
|
|
432
448
|
ValueError
|
|
433
449
|
If required columns are missing from the metadata.
|
|
434
450
|
"""
|
|
451
|
+
full_flat_config_dict = build_full_flat_config_dict(
|
|
452
|
+
study_specific_config_dict, software_config_dict, stds_fp)
|
|
453
|
+
|
|
454
|
+
needed_cols = [(HOSTTYPE_SHORTHAND_KEY, HOSTTYPE_COL_OPTIONS_KEY),
|
|
455
|
+
(SAMPLETYPE_SHORTHAND_KEY, SAMPLETYPE_COL_OPTIONS_KEY)]
|
|
456
|
+
for curr_key, curr_options_key in needed_cols:
|
|
457
|
+
if curr_key not in raw_metadata_df.columns:
|
|
458
|
+
specified_name = _get_specified_column_name(
|
|
459
|
+
curr_options_key, raw_metadata_df, full_flat_config_dict)
|
|
460
|
+
if specified_name:
|
|
461
|
+
raw_metadata_df[curr_key] = raw_metadata_df[specified_name]
|
|
462
|
+
|
|
435
463
|
validate_required_columns_exist(
|
|
436
464
|
raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
|
|
437
465
|
"metadata missing required columns")
|
|
438
466
|
|
|
439
|
-
full_flat_config_dict = build_full_flat_config_dict(
|
|
440
|
-
study_specific_config_dict, software_config_dict, stds_fp)
|
|
441
|
-
|
|
442
467
|
metadata_df, validation_msgs_df = _populate_metadata_df(
|
|
443
468
|
raw_metadata_df, full_flat_config_dict,
|
|
444
469
|
study_specific_transformers_dict)
|
|
@@ -446,6 +471,40 @@ def extend_metadata_df(
|
|
|
446
471
|
return metadata_df, validation_msgs_df
|
|
447
472
|
|
|
448
473
|
|
|
474
|
+
def _get_specified_column_name(
|
|
475
|
+
col_options_key: str,
|
|
476
|
+
raw_metadata_df: pandas.DataFrame,
|
|
477
|
+
config_dict: Dict[str, Any] = None) -> Optional[str]:
|
|
478
|
+
"""Get the specified type of column name from the metadata DataFrame based on possible options.
|
|
479
|
+
|
|
480
|
+
Parameters
|
|
481
|
+
----------
|
|
482
|
+
col_options_key : str
|
|
483
|
+
Key in the config dict that holds the list of possible column names to check.
|
|
484
|
+
raw_metadata_df : pandas.DataFrame
|
|
485
|
+
The metadata DataFrame to check.
|
|
486
|
+
config_dict : Dict[str, Any], default=None
|
|
487
|
+
Configuration dictionary. If provided, may contain a list of possible
|
|
488
|
+
column names under the key specified by col_options_key.
|
|
489
|
+
If None, defaults to values from the main config.yml file.
|
|
490
|
+
Returns
|
|
491
|
+
-------
|
|
492
|
+
Optional[str]
|
|
493
|
+
The specified column name found in the DataFrame, or None if not found.
|
|
494
|
+
"""
|
|
495
|
+
found_name = None
|
|
496
|
+
|
|
497
|
+
if not config_dict:
|
|
498
|
+
config_dict = extract_config_dict(None)
|
|
499
|
+
col_options = config_dict.get(col_options_key)
|
|
500
|
+
if col_options:
|
|
501
|
+
for col_name in col_options:
|
|
502
|
+
if col_name in raw_metadata_df.columns:
|
|
503
|
+
found_name = col_name
|
|
504
|
+
break
|
|
505
|
+
|
|
506
|
+
return found_name
|
|
507
|
+
|
|
449
508
|
def write_metadata_results(
|
|
450
509
|
metadata_df: pandas.DataFrame,
|
|
451
510
|
validation_msgs_df: pandas.DataFrame,
|
|
@@ -814,12 +873,6 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
|
|
|
814
873
|
- The updated metadata DataFrame with sample-type-specific elements added
|
|
815
874
|
- A list of validation messages
|
|
816
875
|
"""
|
|
817
|
-
# copy the metadata fields dict from the host type config to be the
|
|
818
|
-
# basis of the work-in-progress metadata dict--these are the default fields
|
|
819
|
-
# that will be overwritten, if necessary, by sample type-specific fields
|
|
820
|
-
wip_metadata_fields_dict = deepcopy_dict(
|
|
821
|
-
a_host_type_config_dict.get(METADATA_FIELDS_KEY, {}))
|
|
822
|
-
|
|
823
876
|
# get the config section for *all* sample types within this host type
|
|
824
877
|
host_sample_types_config_dict = \
|
|
825
878
|
a_host_type_config_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY]
|
|
@@ -837,20 +890,17 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
|
|
|
837
890
|
update_metadata_df_field(
|
|
838
891
|
sample_type_df, QC_NOTE_KEY, "invalid sample_type")
|
|
839
892
|
else:
|
|
840
|
-
#
|
|
841
|
-
#
|
|
842
|
-
#
|
|
893
|
+
# Get the already-resolved metadata fields dict for this sample type.
|
|
894
|
+
# The config is pre-resolved: aliases/base types are merged and
|
|
895
|
+
# host metadata is combined.
|
|
896
|
+
sample_type_config = host_sample_types_config_dict[a_sample_type]
|
|
843
897
|
full_sample_type_metadata_fields_dict = \
|
|
844
|
-
|
|
845
|
-
a_sample_type, host_sample_types_config_dict, wip_metadata_fields_dict)
|
|
898
|
+
sample_type_config.get(METADATA_FIELDS_KEY, {})
|
|
846
899
|
|
|
847
900
|
# update the metadata df with the sample type specific metadata fields
|
|
848
|
-
# TODO: this is taking in wip_metadata_fields_dict instead of full_sample_type_metadata_fields_dict,
|
|
849
|
-
# which only works because the code underlying _construct_sample_type_metadata_fields_dict
|
|
850
|
-
# is *modifying* wip_metadata_fields_dict in place. This should be corrected, but that
|
|
851
|
-
# needs to wait until there are tests to make sure doing so doesn't break anything.
|
|
852
901
|
sample_type_df = _update_metadata_from_dict(
|
|
853
|
-
sample_type_df,
|
|
902
|
+
sample_type_df, full_sample_type_metadata_fields_dict,
|
|
903
|
+
dict_is_metadata_fields=True,
|
|
854
904
|
overwrite_non_nans=global_plus_host_settings_dict[OVERWRITE_NON_NANS_KEY])
|
|
855
905
|
|
|
856
906
|
# for fields that are required but not yet filled, replace the placeholder with
|
|
@@ -27,6 +27,8 @@ SOURCES_KEY = "sources"
|
|
|
27
27
|
FUNCTION_KEY = "function"
|
|
28
28
|
LEAVE_REQUIREDS_BLANK_KEY = "leave_requireds_blank"
|
|
29
29
|
OVERWRITE_NON_NANS_KEY = "overwrite_non_nans"
|
|
30
|
+
HOSTTYPE_COL_OPTIONS_KEY = "hosttype_column_options"
|
|
31
|
+
SAMPLETYPE_COL_OPTIONS_KEY = "sampletype_column_options"
|
|
30
32
|
|
|
31
33
|
# internal code keys
|
|
32
34
|
HOSTTYPE_SHORTHAND_KEY = "hosttype_shorthand"
|