metameq 2026.1.1.tar.gz → 2026.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {metameq-2026.1.1/metameq.egg-info → metameq-2026.2.1}/PKG-INFO +1 -1
  2. {metameq-2026.1.1 → metameq-2026.2.1}/environment.yml +1 -0
  3. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/_version.py +3 -3
  4. metameq-2026.2.1/metameq/config/config.yml +7 -0
  5. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_configurator.py +146 -1
  6. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_extender.py +92 -42
  7. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/util.py +2 -0
  8. metameq-2026.2.1/metameq/tests/test_metadata_configurator.py +4870 -0
  9. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_extender.py +1801 -126
  10. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_merger.py +1 -1
  11. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_util.py +1 -1
  12. {metameq-2026.1.1 → metameq-2026.2.1/metameq.egg-info}/PKG-INFO +1 -1
  13. metameq-2026.1.1/metameq/config/config.yml +0 -3
  14. metameq-2026.1.1/metameq/tests/test_metadata_configurator.py +0 -2334
  15. {metameq-2026.1.1 → metameq-2026.2.1}/.gitattributes +0 -0
  16. {metameq-2026.1.1 → metameq-2026.2.1}/.github/workflows/main.yaml +0 -0
  17. {metameq-2026.1.1 → metameq-2026.2.1}/.gitignore +0 -0
  18. {metameq-2026.1.1 → metameq-2026.2.1}/README.md +0 -0
  19. {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq.png +0 -0
  20. {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq_dark.svg +0 -0
  21. {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq_light.svg +0 -0
  22. {metameq-2026.1.1 → metameq-2026.2.1}/assets/metameq_medium.png +0 -0
  23. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/__init__.py +0 -0
  24. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/config/__init__.py +0 -0
  25. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/config/standards.yml +0 -0
  26. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/__init__.py +0 -0
  27. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/__main__.py +0 -0
  28. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_merger.py +0 -0
  29. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_transformers.py +0 -0
  30. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/src/metadata_validator.py +0 -0
  31. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/__init__.py +0 -0
  32. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/data/invalid.yml +0 -0
  33. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/data/test_config.yml +0 -0
  34. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_transformers.py +0 -0
  35. {metameq-2026.1.1 → metameq-2026.2.1}/metameq/tests/test_metadata_validator.py +0 -0
  36. {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/SOURCES.txt +0 -0
  37. {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/dependency_links.txt +0 -0
  38. {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/entry_points.txt +0 -0
  39. {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/requires.txt +0 -0
  40. {metameq-2026.1.1 → metameq-2026.2.1}/metameq.egg-info/top_level.txt +0 -0
  41. {metameq-2026.1.1 → metameq-2026.2.1}/setup.cfg +0 -0
  42. {metameq-2026.1.1 → metameq-2026.2.1}/setup.py +0 -0
  43. {metameq-2026.1.1 → metameq-2026.2.1}/versioneer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: metameq
3
- Version: 2026.1.1
3
+ Version: 2026.2.1
4
4
  Summary: Qiita-compliant metadata generation and validation tool
5
5
  Home-page: https://github.com/AmandaBirmingham/metameq
6
6
  Author: Amanda Birmingham
@@ -10,5 +10,6 @@ dependencies:
10
10
  - pyyaml
11
11
  - flake8
12
12
  - pep8
13
+ - pytest
13
14
  - pip:
14
15
  - cerberus
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-01-28T14:30:42-0800",
11
+ "date": "2026-02-01T21:32:06-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "b60757af0c4b7b16d71119971565d9991779f6d2",
15
- "version": "2026.01.1"
14
+ "full-revisionid": "87171fd73f555e2c03a15fa36ed9b5a912b824e9",
15
+ "version": "2026.02.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,7 @@
1
+ "default": "not applicable"
2
+ "leave_requireds_blank": false
3
+ "overwrite_non_nans": false
4
+ "hosttype_column_options":
5
+ - "host_common_name"
6
+ "sampletype_column_options":
7
+ - "sample_type"
@@ -4,7 +4,8 @@ from metameq.src.util import extract_config_dict, extract_stds_config, \
4
4
  METADATA_FIELDS_KEY, STUDY_SPECIFIC_METADATA_KEY, \
5
5
  HOST_TYPE_SPECIFIC_METADATA_KEY, \
6
6
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
7
- DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY
7
+ DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY, \
8
+ SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE
8
9
 
9
10
 
10
11
  def combine_stds_and_study_config(
@@ -89,6 +90,15 @@ def flatten_nested_stds_dict(
89
90
  if curr_host_type_sub_host_dict:
90
91
  wip_host_types_dict.update(curr_host_type_sub_host_dict)
91
92
 
93
+ # resolve aliases and base types for this host's sample types
94
+ # This happens AFTER recursion so children inherit unresolved aliases,
95
+ # ensuring correct bottom-up resolution order
96
+ if SAMPLE_TYPE_SPECIFIC_METADATA_KEY in curr_host_type_wip_flat_dict:
97
+ curr_host_type_wip_flat_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY] = \
98
+ _resolve_sample_type_aliases_and_bases(
99
+ curr_host_type_wip_flat_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY],
100
+ curr_host_type_wip_flat_dict.get(METADATA_FIELDS_KEY, {}))
101
+
92
102
  # assign the flattened wip dict for the current host type to the result
93
103
  # (which now contains flat records for the hosts lower down than
94
104
  # this, if there are any)
@@ -270,8 +280,11 @@ def _combine_base_and_added_host_type(
270
280
  _combine_base_and_added_sample_type_specific_metadata(
271
281
  host_type_wip_nested_dict,
272
282
  host_type_add_dict)
283
+
273
284
  # if we got back a non-empty dictionary of sample types,
274
285
  # add it to the wip for this host type dict
286
+ # Note: resolution of aliases/base types happens in flatten_nested_stds_dict
287
+ # AFTER recursion, to ensure correct bottom-up resolution order
275
288
  if curr_host_wip_sample_types_dict:
276
289
  host_type_wip_nested_dict[
277
290
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY] = \
@@ -450,6 +463,130 @@ def _id_sample_type_definition(sample_type_name: str, sample_type_dict: Dict[str
450
463
  "the same sample type dict")
451
464
 
452
465
 
466
+ def _construct_sample_type_metadata_fields_dict(
467
+ sample_type: str,
468
+ host_sample_types_config_dict: Dict[str, Any],
469
+ a_host_type_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
470
+ """Construct metadata fields dictionary for a specific host+sample type, resolving aliases and base types.
471
+
472
+ Parameters
473
+ ----------
474
+ sample_type : str
475
+ The sample type to process.
476
+ host_sample_types_config_dict : Dict[str, Any]
477
+ Dictionary containing config for *all* sample types in
478
+ the host type in question.
479
+ a_host_type_metadata_fields_dict : Dict[str, Any]
480
+ Dictionary containing metadata fields for the host type in question.
481
+
482
+ Returns
483
+ -------
484
+ Dict[str, Any]
485
+ The constructed metadata fields dictionary for this host-and-sample-type combination.
486
+
487
+ Raises
488
+ ------
489
+ ValueError
490
+ If there are invalid alias chains or base type configurations.
491
+ """
492
+ sample_type_for_metadata = sample_type
493
+
494
+ # get dict associated with the naive sample type
495
+ sample_type_specific_dict = \
496
+ host_sample_types_config_dict[sample_type]
497
+
498
+ # if naive sample type contains an alias
499
+ sample_type_alias = sample_type_specific_dict.get(ALIAS_KEY)
500
+ if sample_type_alias:
501
+ # change the sample type to the alias sample type
502
+ # and use the alias's sample type dict
503
+ sample_type_for_metadata = sample_type_alias
504
+ sample_type_specific_dict = \
505
+ host_sample_types_config_dict[sample_type_alias]
506
+ if METADATA_FIELDS_KEY not in sample_type_specific_dict:
507
+ raise ValueError(f"May not chain aliases "
508
+ f"('{sample_type}' to '{sample_type_alias}')")
509
+ # endif sample type is an alias
510
+
511
+ # if the sample type has a base type
512
+ sample_type_base = sample_type_specific_dict.get(BASE_TYPE_KEY)
513
+ if sample_type_base:
514
+ # get the base's sample type dict and add this sample type's
515
+ # info on top of it
516
+ base_sample_dict = host_sample_types_config_dict[sample_type_base]
517
+ if list(base_sample_dict.keys()) != [METADATA_FIELDS_KEY]:
518
+ raise ValueError(f"Base sample type '{sample_type_base}' "
519
+ f"must only have metadata fields")
520
+ sample_type_specific_dict_metadata = update_wip_metadata_dict(
521
+ deepcopy_dict(base_sample_dict[METADATA_FIELDS_KEY]),
522
+ sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))
523
+ sample_type_specific_dict = deepcopy_dict(sample_type_specific_dict)
524
+ sample_type_specific_dict[METADATA_FIELDS_KEY] = \
525
+ sample_type_specific_dict_metadata
526
+ # endif sample type has a base type
527
+
528
+ # add the sample-type-specific info generated above on top of the host info
529
+ sample_type_metadata_dict = update_wip_metadata_dict(
530
+ deepcopy_dict(a_host_type_metadata_fields_dict),
531
+ sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))
532
+
533
+ # set sample_type, and qiita_sample_type if it is not already set
534
+ sample_type_definition = {
535
+ ALLOWED_KEY: [sample_type_for_metadata],
536
+ DEFAULT_KEY: sample_type_for_metadata,
537
+ TYPE_KEY: "string"
538
+ }
539
+ sample_type_metadata_dict = update_wip_metadata_dict(
540
+ sample_type_metadata_dict, {SAMPLE_TYPE_KEY: sample_type_definition})
541
+ if QIITA_SAMPLE_TYPE not in sample_type_metadata_dict:
542
+ sample_type_metadata_dict = update_wip_metadata_dict(
543
+ sample_type_metadata_dict, {QIITA_SAMPLE_TYPE: sample_type_definition})
544
+ # end if qiita_sample_type not already set
545
+
546
+ return sample_type_metadata_dict
547
+
548
+
549
+ def _resolve_sample_type_aliases_and_bases(
550
+ sample_types_dict: Dict[str, Any],
551
+ host_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
552
+ """Resolve aliases and base types in sample type definitions.
553
+
554
+ For each sample type in the input dictionary:
555
+ 1. If it's an alias, follow the alias and resolve the target's metadata
556
+ 2. If it has a base_type, inherit metadata fields from the base
557
+ 3. Merge sample-type metadata fields with host-level metadata fields
558
+ 4. Add sample_type and qiita_sample_type fields
559
+
560
+ Parameters
561
+ ----------
562
+ sample_types_dict : Dict[str, Any]
563
+ Dictionary of sample type configurations (from sample_type_specific_metadata).
564
+ host_metadata_fields_dict : Dict[str, Any]
565
+ Host-level metadata fields to merge into each sample type.
566
+
567
+ Returns
568
+ -------
569
+ Dict[str, Any]
570
+ Dictionary with all sample types resolved.
571
+
572
+ Raises
573
+ ------
574
+ ValueError
575
+ If chained aliases are detected or base type has invalid structure.
576
+ """
577
+ result = {}
578
+
579
+ for sample_type_name in sample_types_dict.keys():
580
+ resolved_metadata = _construct_sample_type_metadata_fields_dict(
581
+ sample_type_name, sample_types_dict, host_metadata_fields_dict)
582
+
583
+ result[sample_type_name] = {
584
+ METADATA_FIELDS_KEY: resolved_metadata
585
+ }
586
+
587
+ return result
588
+
589
+
453
590
  def build_full_flat_config_dict(
454
591
  study_specific_config_dict: Optional[Dict[str, Any]] = None,
455
592
  software_config_dict: Optional[Dict[str, Any]] = None,
@@ -503,6 +640,14 @@ def build_full_flat_config_dict(
503
640
  full_nested_hosts_dict, None)
504
641
  software_plus_study_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY] = \
505
642
  full_flat_hosts_dict
643
+
644
+ # drop the STUDY_SPECIFIC_METADATA_KEY from the final output dict (because
645
+ # its contents have already been incorporated into the
646
+ # HOST_TYPE_SPECIFIC_METADATA_KEY section); note we keep all the other
647
+ # top-level keys from the study-specific config dict
648
+ if STUDY_SPECIFIC_METADATA_KEY in software_plus_study_flat_config_dict:
649
+ del software_plus_study_flat_config_dict[STUDY_SPECIFIC_METADATA_KEY]
650
+
506
651
  # this is just a renaming to indicate that, having overwritten any original
507
652
  # HOST_TYPE_SPECIFIC_METADATA_KEY in the software_plus_study_flat_config_dict
508
653
  # with the complete and flattened combination of software+study+standards, it is now
@@ -15,7 +15,8 @@ from metameq.src.util import extract_config_dict, \
15
15
  LEAVE_BLANK_VAL, SAMPLE_NAME_KEY, \
16
16
  ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY, OVERWRITE_NON_NANS_KEY, \
17
17
  METADATA_TRANSFORMERS_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
18
- SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS
18
+ SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS, \
19
+ HOSTTYPE_COL_OPTIONS_KEY, SAMPLETYPE_COL_OPTIONS_KEY
19
20
  from metameq.src.metadata_configurator import update_wip_metadata_dict, \
20
21
  build_full_flat_config_dict
21
22
  from metameq.src.metadata_validator import validate_metadata_df, \
@@ -42,7 +43,7 @@ pandas.set_option("future.no_silent_downcasting", True)
42
43
  def get_reserved_cols(
43
44
  raw_metadata_df: pandas.DataFrame,
44
45
  study_specific_config_dict: Dict[str, Any],
45
- study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
46
+ stds_fp: Optional[str] = None) -> List[str]:
46
47
  """Get a list of all reserved column names for all host+sample type combinations in the metadata.
47
48
 
48
49
  Note that 'reserved' is not the same as 'required'. Some column names (e.g.,
@@ -55,8 +56,9 @@ def get_reserved_cols(
55
56
  The input metadata DataFrame.
56
57
  study_specific_config_dict : Dict[str, Any]
57
58
  Study-specific flat-host-type config dictionary.
58
- study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
59
- Dictionary of custom transformers for this study (only).
59
+ stds_fp : Optional[str], default=None
60
+ Path to standards dictionary file. If None, the default standards
61
+ config pulled from the standards.yml file will be used.
60
62
 
61
63
  Returns
62
64
  -------
@@ -90,10 +92,10 @@ def get_reserved_cols(
90
92
 
91
93
  # extend the metadata_df to get all the required columns for all host+sample type combinations;
92
94
  # we don't really care about the contents of these columns, just their names.
93
- # (I doubt it is necessary to pass the actual study_specific_transformers_dict; could probably just use None)
95
+ # (Likewise, it is not necessary to pass the actual study_specific_transformers_dict so
96
+ # just use None)
94
97
  metadata_df, _ = extend_metadata_df(
95
- temp_df, study_specific_config_dict,
96
- study_specific_transformers_dict)
98
+ temp_df, study_specific_config_dict, None, None, stds_fp)
97
99
 
98
100
  return sorted(metadata_df.columns.to_list())
99
101
 
@@ -119,7 +121,7 @@ def id_missing_cols(a_df: pandas.DataFrame) -> List[str]:
119
121
  def find_standard_cols(
120
122
  a_df: pandas.DataFrame,
121
123
  study_specific_config_dict: Dict[str, Any],
122
- study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
124
+ stds_fp: Optional[str] = None,
123
125
  suppress_missing_name_err: bool = False) -> List[str]:
124
126
  """Find all the standard columns in the metadata DataFrame.
125
127
 
@@ -129,8 +131,9 @@ def find_standard_cols(
129
131
  The metadata DataFrame to analyze.
130
132
  study_specific_config_dict : Dict[str, Any]
131
133
  Study-specific flat-host-type config dictionary.
132
- study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
133
- Dictionary of custom transformers for this study (only).
134
+ stds_fp : Optional[str], default=None
135
+ Path to standards dictionary file. If None, the default standards
136
+ config pulled from the standards.yml file will be used.
134
137
  suppress_missing_name_err : bool, default=False
135
138
  Whether to suppress errors about missing sample name.
136
139
 
@@ -156,8 +159,7 @@ def find_standard_cols(
156
159
  # get the intersection of the reserved standard columns and
157
160
  # the columns in the input dataframe
158
161
  standard_cols = get_reserved_cols(
159
- a_df, study_specific_config_dict,
160
- study_specific_transformers_dict=study_specific_transformers_dict)
162
+ a_df, study_specific_config_dict, stds_fp)
161
163
 
162
164
  standard_cols_set = (set(standard_cols) - set(INTERNAL_COL_KEYS))
163
165
 
@@ -167,7 +169,7 @@ def find_standard_cols(
167
169
  def find_nonstandard_cols(
168
170
  a_df: pandas.DataFrame,
169
171
  study_specific_config_dict: Dict[str, Any],
170
- study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
172
+ stds_fp: Optional[str] = None) -> List[str]:
171
173
  """Find any non-standard columns in the metadata DataFrame.
172
174
 
173
175
  Parameters
@@ -176,8 +178,9 @@ def find_nonstandard_cols(
176
178
  The metadata DataFrame to analyze.
177
179
  study_specific_config_dict : Dict[str, Any]
178
180
  Study-specific flat-host-type config dictionary.
179
- study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
180
- Dictionary of custom transformers for this study (only).
181
+ stds_fp : Optional[str], default=None
182
+ Path to standards dictionary file. If None, the default standards
183
+ config pulled from the standards.yml file will be used.
181
184
 
182
185
  Returns
183
186
  -------
@@ -195,15 +198,15 @@ def find_nonstandard_cols(
195
198
 
196
199
  # get the columns in
197
200
  standard_cols = get_reserved_cols(
198
- a_df, study_specific_config_dict,
199
- study_specific_transformers_dict=study_specific_transformers_dict)
201
+ a_df, study_specific_config_dict, stds_fp)
200
202
 
201
203
  return list(set(a_df.columns) - set(standard_cols))
202
204
 
203
205
 
204
206
  def get_extended_metadata_from_df_and_yaml(
205
207
  raw_metadata_df: pandas.DataFrame,
206
- study_specific_config_fp: Optional[str]) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
208
+ study_specific_config_fp: Optional[str],
209
+ stds_fp: Optional[str] = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
207
210
  """Extend metadata using configuration from a study-specific YAML config file.
208
211
 
209
212
  Parameters
@@ -212,6 +215,9 @@ def get_extended_metadata_from_df_and_yaml(
212
215
  The raw metadata DataFrame to extend.
213
216
  study_specific_config_fp : Optional[str]
214
217
  Path to the study-specific configuration YAML file.
218
+ stds_fp : Optional[str], default=None
219
+ Path to standards dictionary file. If None, the default standards
220
+ config pulled from the standards.yml file will be used.
215
221
 
216
222
  Returns
217
223
  -------
@@ -226,7 +232,8 @@ def get_extended_metadata_from_df_and_yaml(
226
232
 
227
233
  # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
228
234
  metadata_df, validation_msgs_df = \
229
- extend_metadata_df(raw_metadata_df, study_specific_config_dict)
235
+ extend_metadata_df(raw_metadata_df, study_specific_config_dict,
236
+ None, None, stds_fp)
230
237
 
231
238
  return metadata_df, validation_msgs_df
232
239
 
@@ -257,7 +264,8 @@ def write_extended_metadata(
257
264
  out_name_base: str,
258
265
  sep: str = "\t",
259
266
  remove_internals: bool = True,
260
- suppress_empty_fails: bool = False) -> pandas.DataFrame:
267
+ suppress_empty_fails: bool = False,
268
+ stds_fp: Optional[str] = None) -> pandas.DataFrame:
261
269
  """Write extended metadata to files starting from input file paths to metadata and config.
262
270
 
263
271
  Parameters
@@ -276,6 +284,9 @@ def write_extended_metadata(
276
284
  Whether to remove internal columns.
277
285
  suppress_empty_fails : bool, default=False
278
286
  Whether to suppress empty failure files.
287
+ stds_fp : Optional[str], default=None
288
+ Path to standards dictionary file. If None, the default standards
289
+ config pulled from the standards.yml file will be used.
279
290
 
280
291
  Returns
281
292
  -------
@@ -310,7 +321,8 @@ def write_extended_metadata(
310
321
  raw_metadata_df, study_specific_config_dict,
311
322
  out_dir, out_name_base, sep=sep,
312
323
  remove_internals=remove_internals,
313
- suppress_empty_fails=suppress_empty_fails)
324
+ suppress_empty_fails=suppress_empty_fails,
325
+ stds_fp=stds_fp)
314
326
 
315
327
  # for good measure, return the extended metadata DataFrame
316
328
  return extended_df
@@ -351,7 +363,8 @@ def write_extended_metadata_from_df(
351
363
  sep: str = "\t",
352
364
  remove_internals: bool = True,
353
365
  suppress_empty_fails: bool = False,
354
- internal_col_names: Optional[List[str]] = None) -> pandas.DataFrame:
366
+ internal_col_names: Optional[List[str]] = None,
367
+ stds_fp: Optional[str] = None) -> pandas.DataFrame:
355
368
  """Write extended metadata to files starting from a metadata DataFrame and config dictionary.
356
369
 
357
370
  Parameters
@@ -374,6 +387,9 @@ def write_extended_metadata_from_df(
374
387
  Whether to suppress empty failure files.
375
388
  internal_col_names : Optional[List[str]], default=None
376
389
  List of internal column names.
390
+ stds_fp : Optional[str], default=None
391
+ Path to standards dictionary file. If None, the default standards
392
+ config pulled from the standards.yml file will be used.
377
393
 
378
394
  Returns
379
395
  -------
@@ -383,7 +399,7 @@ def write_extended_metadata_from_df(
383
399
  # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
384
400
  metadata_df, validation_msgs_df = extend_metadata_df(
385
401
  raw_metadata_df, study_specific_config_dict,
386
- study_specific_transformers_dict)
402
+ study_specific_transformers_dict, None, stds_fp)
387
403
 
388
404
  # write the metadata and validation results to files
389
405
  write_metadata_results(
@@ -432,13 +448,22 @@ def extend_metadata_df(
432
448
  ValueError
433
449
  If required columns are missing from the metadata.
434
450
  """
451
+ full_flat_config_dict = build_full_flat_config_dict(
452
+ study_specific_config_dict, software_config_dict, stds_fp)
453
+
454
+ needed_cols = [(HOSTTYPE_SHORTHAND_KEY, HOSTTYPE_COL_OPTIONS_KEY),
455
+ (SAMPLETYPE_SHORTHAND_KEY, SAMPLETYPE_COL_OPTIONS_KEY)]
456
+ for curr_key, curr_options_key in needed_cols:
457
+ if curr_key not in raw_metadata_df.columns:
458
+ specified_name = _get_specified_column_name(
459
+ curr_options_key, raw_metadata_df, full_flat_config_dict)
460
+ if specified_name:
461
+ raw_metadata_df[curr_key] = raw_metadata_df[specified_name]
462
+
435
463
  validate_required_columns_exist(
436
464
  raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
437
465
  "metadata missing required columns")
438
466
 
439
- full_flat_config_dict = build_full_flat_config_dict(
440
- study_specific_config_dict, software_config_dict, stds_fp)
441
-
442
467
  metadata_df, validation_msgs_df = _populate_metadata_df(
443
468
  raw_metadata_df, full_flat_config_dict,
444
469
  study_specific_transformers_dict)
@@ -446,6 +471,40 @@ def extend_metadata_df(
446
471
  return metadata_df, validation_msgs_df
447
472
 
448
473
 
474
+ def _get_specified_column_name(
475
+ col_options_key: str,
476
+ raw_metadata_df: pandas.DataFrame,
477
+ config_dict: Dict[str, Any] = None) -> Optional[str]:
478
+ """Get the specified type of column name from the metadata DataFrame based on possible options.
479
+
480
+ Parameters
481
+ ----------
482
+ col_options_key : str
483
+ Key in the config dict that holds the list of possible column names to check.
484
+ raw_metadata_df : pandas.DataFrame
485
+ The metadata DataFrame to check.
486
+ config_dict : Dict[str, Any], default=None
487
+ Configuration dictionary. If provided, may contain a list of possible
488
+ column names under the key specified by col_options_key.
489
+ If None, defaults to values from the main config.yml file.
490
+ Returns
491
+ -------
492
+ Optional[str]
493
+ The specified column name found in the DataFrame, or None if not found.
494
+ """
495
+ found_name = None
496
+
497
+ if not config_dict:
498
+ config_dict = extract_config_dict(None)
499
+ col_options = config_dict.get(col_options_key)
500
+ if col_options:
501
+ for col_name in col_options:
502
+ if col_name in raw_metadata_df.columns:
503
+ found_name = col_name
504
+ break
505
+
506
+ return found_name
507
+
449
508
  def write_metadata_results(
450
509
  metadata_df: pandas.DataFrame,
451
510
  validation_msgs_df: pandas.DataFrame,
@@ -814,12 +873,6 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
814
873
  - The updated metadata DataFrame with sample-type-specific elements added
815
874
  - A list of validation messages
816
875
  """
817
- # copy the metadata fields dict from the host type config to be the
818
- # basis of the work-in-progress metadata dict--these are the default fields
819
- # that will be overwritten, if necessary, by sample type-specific fields
820
- wip_metadata_fields_dict = deepcopy_dict(
821
- a_host_type_config_dict.get(METADATA_FIELDS_KEY, {}))
822
-
823
876
  # get the config section for *all* sample types within this host type
824
877
  host_sample_types_config_dict = \
825
878
  a_host_type_config_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY]
@@ -837,20 +890,17 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
837
890
  update_metadata_df_field(
838
891
  sample_type_df, QC_NOTE_KEY, "invalid sample_type")
839
892
  else:
840
- # resolve any aliases and base types for the sample type and combine its
841
- # specific metadata fields with the host type's metadata fields
842
- # to get the full set of config info for this host+sample type
893
+ # Get the already-resolved metadata fields dict for this sample type.
894
+ # The config is pre-resolved: aliases/base types are merged and
895
+ # host metadata is combined.
896
+ sample_type_config = host_sample_types_config_dict[a_sample_type]
843
897
  full_sample_type_metadata_fields_dict = \
844
- _construct_sample_type_metadata_fields_dict(
845
- a_sample_type, host_sample_types_config_dict, wip_metadata_fields_dict)
898
+ sample_type_config.get(METADATA_FIELDS_KEY, {})
846
899
 
847
900
  # update the metadata df with the sample type specific metadata fields
848
- # TODO: this is taking in wip_metadata_fields_dict instead of full_sample_type_metadata_fields_dict,
849
- # which only works because the code underlying _construct_sample_type_metadata_fields_dict
850
- # is *modifying* wip_metadata_fields_dict in place. This should be corrected, but that
851
- # needs to wait until there are tests to make sure doing so doesn't break anything.
852
901
  sample_type_df = _update_metadata_from_dict(
853
- sample_type_df, wip_metadata_fields_dict, dict_is_metadata_fields=True,
902
+ sample_type_df, full_sample_type_metadata_fields_dict,
903
+ dict_is_metadata_fields=True,
854
904
  overwrite_non_nans=global_plus_host_settings_dict[OVERWRITE_NON_NANS_KEY])
855
905
 
856
906
  # for fields that are required but not yet filled, replace the placeholder with
@@ -27,6 +27,8 @@ SOURCES_KEY = "sources"
27
27
  FUNCTION_KEY = "function"
28
28
  LEAVE_REQUIREDS_BLANK_KEY = "leave_requireds_blank"
29
29
  OVERWRITE_NON_NANS_KEY = "overwrite_non_nans"
30
+ HOSTTYPE_COL_OPTIONS_KEY = "hosttype_column_options"
31
+ SAMPLETYPE_COL_OPTIONS_KEY = "sampletype_column_options"
30
32
 
31
33
  # internal code keys
32
34
  HOSTTYPE_SHORTHAND_KEY = "hosttype_shorthand"