metameq-2026.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,387 @@
1
+ import cerberus
2
+ import copy
3
+ from datetime import datetime
4
+ from dateutil import parser
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from metameq.src.util import SAMPLE_NAME_KEY, get_extension
9
+
10
+ _TYPE_KEY = "type"
11
+ _ANYOF_KEY = "anyof"
12
+
13
+ # Define a logger for this module
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class MetameqValidator(cerberus.Validator):
18
+ """Custom cerberus Validator with metameq-specific validation rules.
19
+
20
+ Extends the cerberus Validator class to add custom check_with rules
21
+ for validating metadata fields according to metameq requirements.
22
+ Custom rules are invoked by including "check_with" in a field's
23
+ cerberus schema definition.
24
+
25
+ See Also
26
+ --------
27
+ https://docs.python-cerberus.org/customize.html
28
+ """
29
+
30
+ def _check_with_date_not_in_future(self, field, value):
31
+ """Validate that a date field value is not in the future.
32
+
33
+ This method is automatically invoked by cerberus when a field's schema
34
+ includes "check_with": "date_not_in_future". It parses the value as a
35
+ date and validates that it is not after the current date/time.
36
+
37
+ Parameters
38
+ ----------
39
+ field : str
40
+ The name of the field being validated.
41
+ value : str
42
+ The date string to validate.
43
+
44
+ Notes
45
+ -----
46
+ Adds a validation error if:
47
+ - The value cannot be parsed as a valid date
48
+ - The parsed date is in the future
49
+ """
50
+ # convert the field string to a date
51
+ try:
52
+ putative_date = parser.parse(value, fuzzy=True, dayfirst=False)
53
+ except Exception: # noqa: E722
54
+ self._error(field, "Must be a valid date")
55
+ return
56
+
57
+ if putative_date > datetime.now():
58
+ self._error(field, "Date cannot be in the future")
59
+
60
+
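# A minimal sketch (not from the package) of how the rule above is wired up,
# assuming a hypothetical field definition: any field whose cerberus schema
# entry contains "check_with": "date_not_in_future" is routed to the method
# above during validation.
example_schema = {"collection_timestamp": {
    "type": "string", "check_with": "date_not_in_future"}}
example_validator = MetameqValidator()
example_validator.validate(
    {"collection_timestamp": "2199-01-01"}, example_schema)  # -> False
# example_validator.errors ==
#     {'collection_timestamp': ['Date cannot be in the future']}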
61
+ def validate_metadata_df(metadata_df, sample_type_full_metadata_fields_dict):
62
+ """Validate a metadata DataFrame against a field definition schema.
63
+
64
+ Converts the metadata fields dictionary into a cerberus schema, casts
65
+ each field in the DataFrame to its expected type, and validates all rows
66
+ against the schema. Fields defined in the schema but missing from the
67
+ DataFrame are logged and skipped.
68
+
69
+ Parameters
70
+ ----------
71
+ metadata_df : pandas.DataFrame
72
+ The metadata DataFrame to validate. Must contain a SAMPLE_NAME_KEY
73
+ column for identifying samples in validation error messages.
74
+ sample_type_full_metadata_fields_dict : dict
75
+ A dictionary defining metadata fields and their validation rules.
76
+ May contain metameq-specific keys (is_phi, field_desc, units,
77
+ min_exclusive, unique) which will be stripped before cerberus
78
+ validation, as well as standard cerberus keys (type, required,
79
+ allowed, regex, etc.).
80
+
81
+ Returns
82
+ -------
83
+ list
84
+ A list of dictionaries containing validation errors. Each dictionary
85
+ contains SAMPLE_NAME_KEY, "field_name", and "error_message" keys.
86
+ Returns an empty list if all rows pass validation.
87
+ """
88
+ config = _make_cerberus_schema(sample_type_full_metadata_fields_dict)
89
+
90
+ # NB: typed_metadata_df (the type-cast version of metadata_df) is only
91
+ # used for generating validation messages, after which it is discarded.
92
+ typed_metadata_df = metadata_df.copy()
93
+ for curr_field, curr_definition in \
94
+ sample_type_full_metadata_fields_dict.items():
95
+
96
+ if curr_field not in typed_metadata_df.columns:
97
+ logger.info(
98
+ f"Standard field {curr_field} not in metadata file")
99
+ continue
100
+
101
+ curr_allowed_types = _get_allowed_pandas_types(
102
+ curr_field, curr_definition)
103
+ typed_metadata_df[curr_field] = typed_metadata_df[curr_field].apply(
104
+ lambda x: _cast_field_to_type(x, curr_allowed_types))
105
+ # next field in config
106
+
107
+ validation_msgs = _generate_validation_msg(typed_metadata_df, config)
108
+ return validation_msgs
109
+
110
+
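# A minimal usage sketch for validate_metadata_df, assuming a hand-built
# field-definition dict; in practice the definitions come from the package's
# standards configuration.
import pandas

example_df = pandas.DataFrame(
    {SAMPLE_NAME_KEY: ["s1", "s2"], "host_age": ["12", "400"]})
example_fields = {
    "host_age": {"type": "integer", "max": 130,       # cerberus rules
                 "units": "years", "is_phi": False}}   # metameq-only keys
example_msgs = validate_metadata_df(example_df, example_fields)
# Expected shape (exact wording comes from cerberus), e.g.:
# [{'sample_name': 's2', 'field_name': 'host_age',
#   'error_message': ['max value is 130']}]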
111
+ def output_validation_msgs(validation_msgs_df, out_dir, out_base, sep="\t",
112
+ suppress_empty_fails=False):
113
+ """Write validation messages to a timestamped file.
114
+
115
+ Outputs the validation messages DataFrame to a file with a timestamp prefix.
116
+ If the DataFrame is empty and suppress_empty_fails is False, creates an empty
117
+ file. If suppress_empty_fails is True and the DataFrame is empty, no file is
118
+ created.
119
+
120
+ Parameters
121
+ ----------
122
+ validation_msgs_df : pandas.DataFrame
123
+ DataFrame containing validation error messages.
124
+ out_dir : str
125
+ Directory where the output file will be written.
126
+ out_base : str
127
+ Base name for the output file. The full filename will be
128
+ "{timestamp}_{out_base}_validation_errors.{extension}".
129
+ sep : str, default="\t"
130
+ Separator to use in the output file. Determines file extension
131
+ (tab -> .txt, comma -> .csv).
132
+ suppress_empty_fails : bool, default=False
133
+ If True, no file is created when validation_msgs_df is empty.
134
+ If False, an empty file is created when there are no validation errors.
135
+ """
136
+ timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
137
+ extension = get_extension(sep)
138
+ out_fp = os.path.join(
139
+ out_dir, f"{timestamp_str}_{out_base}_validation_errors.{extension}")
140
+
141
+ if validation_msgs_df.empty:
142
+ if not suppress_empty_fails:
143
+ Path(out_fp).touch()
144
+ # else, just do nothing
145
+ else:
146
+ validation_msgs_df.to_csv(out_fp, sep=sep, index=False)
147
+
148
+
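# A small sketch of the naming behavior described above, assuming a
# hypothetical ./out directory and study name; with the default sep="\t" the
# extension resolves to "txt" via get_extension.
import pandas

example_msgs_df = pandas.DataFrame(
    [{"sample_name": "s2", "field_name": "host_age",
      "error_message": ["max value is 130"]}])
output_validation_msgs(example_msgs_df, "./out", "my_study")
# writes e.g. ./out/2026-01-05_14-30-00_my_study_validation_errors.txt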
149
+ def _make_cerberus_schema(sample_type_metadata_dict):
150
+ """Convert a metadata fields dictionary into a cerberus-compatible validation schema.
151
+
152
+ Creates a deep copy of the input dictionary and removes keys that are not
153
+ recognized by the cerberus validation library (is_phi, field_desc, units,
154
+ min_exclusive, unique). The resulting dictionary can be used directly with
155
+ cerberus for validation.
156
+
157
+ Parameters
158
+ ----------
159
+ sample_type_metadata_dict : dict
160
+ A dictionary containing metadata field definitions, potentially including
161
+ keys that are not recognized by cerberus.
162
+
163
+ Returns
164
+ -------
165
+ dict
166
+ A cerberus-compatible schema with unrecognized keys removed.
167
+ """
168
+ unrecognized_keys = ['is_phi', 'field_desc', 'units',
169
+ 'min_exclusive', 'unique']
170
+ # traverse the host_fields_config dict and remove any keys that are not
171
+ # recognized by cerberus
172
+ cerberus_config = copy.deepcopy(sample_type_metadata_dict)
173
+ cerberus_config = _remove_leaf_keys_from_dict(
174
+ cerberus_config, unrecognized_keys)
175
+
176
+ return cerberus_config
177
+
178
+
179
+ def _remove_leaf_keys_from_dict(input_dict, keys_to_remove):
180
+ """Remove specified leaf keys from a dictionary, recursively processing nested structures.
181
+
182
+ Traverses the input dictionary and removes any keys with non-dict, non-list (leaf) values
183
+ that are in the keys_to_remove list. Keys with dict or list values are always
184
+ preserved (even if they match one of the keys_to_remove), with their contents recursively
185
+ processed. For lists, delegates to _remove_leaf_keys_from_dict_in_list.
186
+ Non-dict, non-list values are deep-copied if their key is not being removed.
187
+
188
+ Parameters
189
+ ----------
190
+ input_dict : dict
191
+ The dictionary to process.
192
+ keys_to_remove : list
193
+ List of key names to remove from the dictionary and any nested dicts.
194
+ Only keys with non-dict, non-list values will be removed.
195
+
196
+ Returns
197
+ -------
198
+ dict
199
+ A new dictionary with the specified leaf keys removed at all nesting levels.
200
+ """
201
+ output_dict = {}
202
+ for curr_key, curr_val in input_dict.items():
203
+ if isinstance(curr_val, dict):
204
+ output_dict[curr_key] = \
205
+ _remove_leaf_keys_from_dict(curr_val, keys_to_remove)
206
+ elif isinstance(curr_val, list):
207
+ output_dict[curr_key] = \
208
+ _remove_leaf_keys_from_dict_in_list(curr_val, keys_to_remove)
209
+ else:
210
+ if curr_key not in keys_to_remove:
211
+ output_dict[curr_key] = copy.deepcopy(curr_val)
212
+ return output_dict
213
+
214
+
215
+ def _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove):
216
+ """Remove specified leaf keys from all dictionaries within a list.
217
+
218
+ Recursively processes the input list and removes any keys with non-dict, non-list (leaf)
219
+ values that are in the keys_to_remove list from any dictionaries found using
220
+ _remove_leaf_keys_from_dict. Handles nested lists and dictionaries at any depth.
221
+ Non-dict, non-list items are preserved unchanged.
222
+
223
+ Parameters
224
+ ----------
225
+ input_list : list
226
+ The list to process. May contain dicts, nested lists, or other values.
227
+ keys_to_remove : list
228
+ List of key names to remove from any dictionaries found.
229
+ Only keys with non-dict, non-list values will be removed.
230
+
231
+ Returns
232
+ -------
233
+ list
234
+ A new list with the specified leaf keys removed from all contained dicts.
235
+ """
236
+ output_list = []
237
+ for curr_val in input_list:
238
+ if isinstance(curr_val, dict):
239
+ output_list.append(
240
+ _remove_leaf_keys_from_dict(curr_val, keys_to_remove))
241
+ elif isinstance(curr_val, list):
242
+ output_list.append(
243
+ _remove_leaf_keys_from_dict_in_list(curr_val, keys_to_remove))
244
+ else:
245
+ output_list.append(curr_val)
246
+ return output_list
247
+
248
+
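# A small sketch of the key stripping performed by the helpers above:
# metameq-only leaf keys disappear at every nesting level (including inside
# "anyof" lists), while cerberus rules pass through untouched.
example_in = {
    "host_age": {"units": "years", "is_phi": False, "min_exclusive": True,
                 "anyof": [{"type": "integer"}, {"type": "float"}]}}
example_out = _make_cerberus_schema(example_in)
# example_out == {"host_age": {"anyof": [{"type": "integer"},
#                                        {"type": "float"}]}}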
249
+ def _cast_field_to_type(raw_field_val, allowed_pandas_types):
250
+ """Cast a field value to one of the allowed Python types.
251
+
252
+ Attempts to cast the raw field value to each type in allowed_pandas_types
253
+ in order, returning the first successful cast. This allows flexible type
254
+ coercion where a value might be validly interpreted as multiple types.
255
+
256
+ Parameters
257
+ ----------
258
+ raw_field_val : any
259
+ The raw value to cast.
260
+ allowed_pandas_types : list
261
+ A list of Python type callables (e.g., str, int, float) to attempt
262
+ casting to, in order of preference.
263
+
264
+ Returns
265
+ -------
266
+ any
267
+ The field value cast to the first successfully matched type.
268
+
269
+ Raises
270
+ ------
271
+ ValueError
272
+ If the value cannot be cast to any of the allowed types.
273
+ """
274
+ typed_field_val = None
275
+ for curr_type in allowed_pandas_types:
276
+ # noinspection PyBroadException
277
+ try:
278
+ typed_field_val = curr_type(raw_field_val)
279
+ break
280
+ except Exception: # noqa: E722
281
+ pass
282
+ # next allowed type
283
+
284
+ if typed_field_val is None:
285
+ raise ValueError(
286
+ f"Unable to cast '{raw_field_val}' to any of the allowed "
287
+ f"types: {allowed_pandas_types}")
288
+
289
+ return typed_field_val
290
+
291
+
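# A small sketch of the order-of-preference casting above: with [int, str],
# a numeric-looking value is cast to int, while anything else falls through
# to the string branch.
_cast_field_to_type("7", [int, str])   # -> 7
_cast_field_to_type("7a", [int, str])  # -> "7a"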
292
+ def _get_allowed_pandas_types(field_name, field_definition):
293
+ """Extract allowed Python types from a cerberus field definition.
294
+
295
+ Reads the type specification from a cerberus field definition and converts
296
+ the cerberus type names to their corresponding Python types. Handles both
297
+ single-type definitions (using "type" key) and multiple-type definitions
298
+ (using "anyof" key with a list of type options).
299
+
300
+ Parameters
301
+ ----------
302
+ field_name : str
303
+ The name of the field being processed. Used only for error messages.
304
+ field_definition : dict
305
+ A cerberus field definition dictionary containing either a "type" key
306
+ with a single type name, or an "anyof" key with a list of type options.
307
+
308
+ Returns
309
+ -------
310
+ list
311
+ A list of Python type callables (str, int, float, bool, or datetime.date)
312
+ corresponding to the allowed cerberus types for this field.
313
+
314
+ Raises
315
+ ------
316
+ ValueError
317
+ If the field definition contains neither a "type" nor an "anyof" key.
318
+ """
319
+ cerberus_to_python_types = {
320
+ "string": str,
321
+ "integer": int,
322
+ "float": float,
323
+ "number": float,
324
+ "bool": bool,
325
+ "datetime": datetime.date}
326
+
327
+ allowed_cerberus_types = []
328
+ if _TYPE_KEY in field_definition:
329
+ allowed_cerberus_types.append(field_definition.get(_TYPE_KEY))
330
+ elif _ANYOF_KEY in field_definition:
331
+ for curr_allowed_type_entry in field_definition[_ANYOF_KEY]:
332
+ allowed_cerberus_types.append(
333
+ curr_allowed_type_entry[_TYPE_KEY])
334
+ # next anyof entry
335
+ else:
336
+ raise ValueError(
337
+ f"Unable to find type definition for field '{field_name}'")
338
+ # if type or anyof key in definition
339
+
340
+ allowed_pandas_types = \
341
+ [cerberus_to_python_types[x] for x in allowed_cerberus_types]
342
+ return allowed_pandas_types
343
+
344
+
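# A small sketch of the type extraction above, using a hypothetical
# "host_weight" field: an "anyof" definition yields one Python type per
# branch, in the order the branches are listed.
example_def = {"anyof": [{"type": "number"}, {"type": "string"}]}
_get_allowed_pandas_types("host_weight", example_def)  # -> [float, str]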
345
+ def _generate_validation_msg(typed_metadata_df, config):
346
+ """Generate validation error messages for a metadata DataFrame.
347
+
348
+ Validates each row of the metadata DataFrame against the provided cerberus
349
+ schema configuration and collects any validation errors into a list of
350
+ dictionaries.
351
+
352
+ Parameters
353
+ ----------
354
+ typed_metadata_df : pandas.DataFrame
355
+ A metadata DataFrame with values already cast to their expected types.
356
+ Must contain a SAMPLE_NAME_KEY column for identifying samples.
357
+ config : dict
358
+ A cerberus-compatible validation schema dictionary defining the
359
+ validation rules for each metadata field.
360
+
361
+ Returns
362
+ -------
363
+ list
364
+ A list of dictionaries, where each dictionary contains:
365
+ - SAMPLE_NAME_KEY: The sample name for the row with the error
366
+ - "field_name": The name of the field that failed validation
367
+ - "error_message": The validation error message(s) from cerberus as a list of strings
368
+ Returns an empty list if all rows pass validation.
369
+ """
370
+ v = MetameqValidator()
371
+ v.allow_unknown = True
372
+
373
+ validation_msgs = []
374
+ raw_metadata_dict = typed_metadata_df.to_dict(orient="records")
375
+ for curr_row in raw_metadata_dict:
376
+ if not v.validate(curr_row, config):
377
+ curr_sample_name = curr_row[SAMPLE_NAME_KEY]
378
+ for curr_field_name, curr_err_msg in v.errors.items():
379
+ validation_msgs.append({
380
+ SAMPLE_NAME_KEY: curr_sample_name,
381
+ "field_name": curr_field_name,
382
+ "error_message": curr_err_msg})
383
+ # next error for curr row
384
+ # endif row is not valid
385
+ # next row
386
+
387
+ return validation_msgs
metameq/src/util.py ADDED
@@ -0,0 +1,299 @@
1
+ import copy
2
+ from importlib.resources import files
3
+ import pandas
4
+ from typing import List, Optional, Union, Callable
5
+ import yaml
6
+
7
+ CONFIG_MODULE_PATH = "metameq.config"
8
+
9
+ # config keys
10
+ METADATA_FIELDS_KEY = "metadata_fields"
11
+ STUDY_SPECIFIC_METADATA_KEY = "study_specific_metadata"
12
+ HOST_TYPE_SPECIFIC_METADATA_KEY = "host_type_specific_metadata"
13
+ SAMPLE_TYPE_KEY = "sample_type"
14
+ QIITA_SAMPLE_TYPE = "qiita_sample_type"
15
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY = "sample_type_specific_metadata"
16
+ METADATA_TRANSFORMERS_KEY = "metadata_transformers"
17
+ PRE_TRANSFORMERS_KEY = "pre_transformers"
18
+ POST_TRANSFORMERS_KEY = "post_transformers"
19
+ ALIAS_KEY = "alias"
20
+ BASE_TYPE_KEY = "base_type"
21
+ DEFAULT_KEY = "default"
22
+ REQUIRED_KEY = "required"
23
+ ALLOWED_KEY = "allowed"
24
+ ANYOF_KEY = "anyof"
25
+ TYPE_KEY = "type"
26
+ SOURCES_KEY = "sources"
27
+ FUNCTION_KEY = "function"
28
+ LEAVE_REQUIREDS_BLANK_KEY = "leave_requireds_blank"
29
+ OVERWRITE_NON_NANS_KEY = "overwrite_non_nans"
30
+
31
+ # internal code keys
32
+ HOSTTYPE_SHORTHAND_KEY = "hosttype_shorthand"
33
+ SAMPLETYPE_SHORTHAND_KEY = "sampletype_shorthand"
34
+ QC_NOTE_KEY = "qc_note"
35
+
36
+ # metadata keys
37
+ SAMPLE_NAME_KEY = "sample_name"
38
+ COLLECTION_TIMESTAMP_KEY = "collection_timestamp"
39
+ HOST_SUBJECT_ID_KEY = "host_subject_id"
40
+
41
+ # constant field values
42
+ NOT_PROVIDED_VAL = "not provided"
43
+ LEAVE_BLANK_VAL = "leaveblank"
44
+ DO_NOT_USE_VAL = "donotuse"
45
+
46
+ # required raw metadata fields
47
+ REQUIRED_RAW_METADATA_FIELDS = [SAMPLE_NAME_KEY,
48
+ HOSTTYPE_SHORTHAND_KEY,
49
+ SAMPLETYPE_SHORTHAND_KEY]
50
+
51
+
52
+ def extract_config_dict(
53
+ config_fp: Union[str, None]) -> dict:
54
+ """Extract configuration dictionary from a YAML file.
55
+
56
+ If no config file path is provided, looks for config.yml in the "config"
58
+ module of the package.
58
+
59
+ Parameters
60
+ ----------
61
+ config_fp : Union[str, None]
62
+ Path to the configuration YAML file. If None, will look for config.yml
63
+ in the "config" module of the package.
64
+
65
+ Returns
66
+ -------
67
+ dict
68
+ Configuration dictionary loaded from the YAML file.
69
+
70
+ Raises
71
+ ------
72
+ FileNotFoundError
73
+ If the config file cannot be found.
74
+ yaml.YAMLError
75
+ If the YAML file is invalid.
76
+ """
77
+ if config_fp is None:
78
+ config_dir = files(CONFIG_MODULE_PATH)
79
+ config_fp = config_dir.joinpath("config.yml")
80
+
81
+ # read in config file
82
+ config_dict = extract_yaml_dict(config_fp)
83
+ return config_dict
84
+
85
+
86
+ def extract_yaml_dict(yaml_fp: str) -> dict:
87
+ """Extract dictionary from a YAML file.
88
+
89
+ Parameters
90
+ ----------
91
+ yaml_fp : str
92
+ Path to the YAML file.
93
+
94
+ Returns
95
+ -------
96
+ dict
97
+ Dictionary loaded from the YAML file.
98
+
99
+ Raises
100
+ ------
101
+ FileNotFoundError
102
+ If the YAML file cannot be found.
103
+ yaml.YAMLError
104
+ If the YAML file is invalid.
105
+ """
106
+ with open(yaml_fp, "r") as f:
107
+ yaml_dict = yaml.safe_load(f)
108
+ return yaml_dict
109
+
110
+
111
+ def extract_stds_config(stds_fp: Union[str, None]) -> dict:
112
+ """Extract standards dictionary from a YAML file.
113
+
114
+ If no standards file path is provided, looks for standards.yml in the
115
+ "config" module of the package.
116
+
117
+ Parameters
118
+ ----------
119
+ stds_fp : Union[str, None]
120
+ Path to the standards YAML file. If None, will look for
121
+ standards.yml in the "config" module.
122
+
123
+ Returns
124
+ -------
125
+ dict
126
+ Standards dictionary loaded from the YAML file.
127
+
128
+ Raises
129
+ ------
130
+ FileNotFoundError
131
+ If the standards file cannot be found.
132
+ yaml.YAMLError
133
+ If the YAML file is invalid.
134
+ """
135
+ if not stds_fp:
136
+ config_dir = files(CONFIG_MODULE_PATH)
137
+ stds_fp = config_dir.joinpath("standards.yml")
138
+ return extract_config_dict(stds_fp)
139
+
140
+
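# A minimal usage sketch for the config loaders above: with no explicit path,
# both helpers fall back to the YAML files bundled in the metameq.config
# package (config.yml and standards.yml respectively).
example_config = extract_config_dict(None)
example_standards = extract_stds_config(None)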
141
+ def deepcopy_dict(input_dict: dict) -> dict:
142
+ """Create a deep copy of a dictionary, including nested dictionaries.
143
+
144
+ Parameters
145
+ ----------
146
+ input_dict : dict
147
+ Dictionary to be copied.
148
+
149
+ Returns
150
+ -------
151
+ dict
152
+ Deep copy of the input dictionary.
153
+ """
154
+ output_dict = {}
155
+ for curr_key, curr_val in input_dict.items():
156
+ if isinstance(curr_val, dict):
157
+ output_dict[curr_key] = deepcopy_dict(curr_val)
158
+ else:
159
+ output_dict[curr_key] = copy.deepcopy(curr_val)
160
+ return output_dict
161
+
162
+
163
+ def load_df_with_best_fit_encoding(
164
+ an_fp: str, a_file_separator: str, dtype: Optional[str] = None) -> \
165
+ pandas.DataFrame:
166
+ """Load a DataFrame from a file, trying multiple encodings.
167
+
168
+ Attempts to load the file using various common encodings (utf-8, utf-8-sig,
169
+ iso-8859-1, latin1, cp1252) until successful.
170
+
171
+ Parameters
172
+ ----------
173
+ an_fp : str
174
+ Path to the file to load.
175
+ a_file_separator : str
176
+ Separator character used in the file (e.g., ',' for CSV).
177
+ dtype : Optional[str]
178
+ Data type to use for the DataFrame. If None, pandas will infer types.
179
+
180
+ Returns
181
+ -------
182
+ pandas.DataFrame
183
+ DataFrame loaded from the file.
184
+
185
+ Raises
186
+ ------
187
+ ValueError
188
+ If the file cannot be decoded with any of the available encodings.
189
+ """
190
+ result = None
191
+
192
+ # from https://stackoverflow.com/a/76366653
193
+ encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin1", "cp1252"]
194
+ for encoding in encodings:
195
+ # noinspection PyBroadException
196
+ try:
197
+ result = pandas.read_csv(
198
+ an_fp, sep=a_file_separator, encoding=encoding, dtype=dtype)
199
+ break
200
+ except Exception: # noqa: E722
201
+ pass
202
+
203
+ if result is None:
204
+ raise ValueError(f"Unable to decode {an_fp} "
205
+ f"with any available encoder")
206
+
207
+ return result
208
+
209
+
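# A minimal usage sketch, assuming a hypothetical tab-separated metadata file;
# dtype=str keeps every column as text so that later type casting during
# validation stays explicit.
example_df = load_df_with_best_fit_encoding("metadata.txt", "\t", dtype=str)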
210
+ def validate_required_columns_exist(
211
+ input_df: pandas.DataFrame, required_cols_list: List[str],
212
+ error_msg: str) -> None:
213
+ """Validate that a DataFrame contains all required columns.
214
+
215
+ Parameters
216
+ ----------
217
+ input_df : pandas.DataFrame
218
+ DataFrame to validate.
219
+ required_cols_list : List[str]
220
+ List of column names that must be present in the DataFrame.
221
+ error_msg : str
222
+ Error message to be raised if any required columns are missing.
223
+
224
+ Raises
225
+ ------
226
+ ValueError
227
+ If any of the required columns are missing from the DataFrame.
228
+ """
229
+ missing_cols = set(required_cols_list) - set(input_df.columns)
230
+ if len(missing_cols) > 0:
231
+ missing_cols = sorted(missing_cols)
232
+ raise ValueError(
233
+ f"{error_msg}: {missing_cols}")
234
+
235
+
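# A small sketch of the check above: a missing required column raises a
# ValueError combining the caller's message with the sorted missing names.
import pandas

example_df = pandas.DataFrame({"sample_name": ["s1"]})
validate_required_columns_exist(
    example_df, REQUIRED_RAW_METADATA_FIELDS,
    "metadata is missing required column(s)")
# ValueError: metadata is missing required column(s):
#     ['hosttype_shorthand', 'sampletype_shorthand']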
236
+ def get_extension(sep: str) -> str:
237
+ """Get the appropriate file extension based on the separator character.
238
+
239
+ Parameters
240
+ ----------
241
+ sep : str
242
+ Separator character used in the file.
243
+
244
+ Returns
245
+ -------
246
+ str
247
+ File extension: 'csv' for comma-separated files, 'txt' for others.
248
+ """
249
+ return "csv" if sep == "," else "txt"
250
+
251
+
252
+ def update_metadata_df_field(
253
+ metadata_df: pandas.DataFrame, field_name: str,
254
+ field_val_or_func: Union[
255
+ str, Callable[[pandas.Series, List[str]], str]],
256
+ source_fields: Optional[List[str]] = None,
257
+ overwrite_non_nans: bool = True) -> None:
258
+ """Update or add a field in an existing metadata DataFrame.
259
+
260
+ Can update an existing field or add a new one, using either a constant value
261
+ or a function to compute values based on other fields.
262
+
263
+
264
+ Parameters
265
+ ----------
266
+ metadata_df : pandas.DataFrame
267
+ DataFrame to update. Modified in place.
268
+ field_name : str
269
+ Name of the field to update or add.
270
+ field_val_or_func : Union[str, Callable]
271
+ Either a constant value to set, or a function that takes a row and
272
+ source fields as input and returns a value.
273
+ source_fields : Optional[List[str]]
274
+ List of field names to use as input for the function. Required if
275
+ field_val_or_func is a function.
276
+ overwrite_non_nans : bool
277
+ If True, overwrites all values in the field. If False, only updates
278
+ NaN values.
279
+ """
280
+ # Note: function doesn't return anything. Work is done in-place on the
281
+ # metadata_df passed in.
282
+
283
+ # If the field does not already exist in the metadata OR if we have
284
+ # been told to overwrite existing (i.e., non-NaN) values, we will set its
285
+ # value in all rows; otherwise, will only set it where it is currently NaN
286
+ set_all = overwrite_non_nans or (field_name not in metadata_df.columns)
287
+ row_mask = \
288
+ metadata_df.index if set_all else metadata_df[field_name].isnull()
289
+
290
+ # If source fields were passed in, the field_val_or_func must be a function
291
+ if source_fields:
292
+ metadata_df.loc[row_mask, field_name] = \
293
+ metadata_df.apply(
294
+ lambda row: field_val_or_func(row, source_fields),
295
+ axis=1)
296
+ else:
297
+ # Otherwise, it is a constant value
298
+ metadata_df.loc[row_mask, field_name] = field_val_or_func
299
+ # endif using a function/a constant value
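# A minimal usage sketch of the two update modes above (constant value vs.
# derived function), using a hypothetical DataFrame; the callable receives
# the row plus the list of source field names.
import pandas

example_df = pandas.DataFrame(
    {"sample_name": ["s1", "s2"], "host": ["human", None]})
# constant value, filling in only the currently-NaN cells
update_metadata_df_field(
    example_df, "host", NOT_PROVIDED_VAL, overwrite_non_nans=False)
# derived value computed from other columns, overwriting every row
update_metadata_df_field(
    example_df, "host_subject_id",
    lambda row, srcs: "_".join(str(row[x]) for x in srcs),
    source_fields=["sample_name", "host"])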
@@ -0,0 +1 @@
1
+ invalid: yaml: content: - [
@@ -0,0 +1,9 @@
1
+ host_type_specific_metadata:
2
+ base:
3
+ metadata_fields:
4
+ sample_name:
5
+ type: string
6
+ unique: true
7
+ sample_type:
8
+ empty: false
9
+ is_phi: false
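# A small sketch (not part of the fixture) tying this config fragment to the
# validation code above: the nested YAML parses into plain dicts, and
# stripping the metameq-only keys (unique, is_phi) with _make_cerberus_schema
# leaves an ordinary cerberus schema.
import yaml

example_yaml = """
host_type_specific_metadata:
  base:
    metadata_fields:
      sample_name:
        type: string
        unique: true
      sample_type:
        empty: false
        is_phi: false
"""
example_fields = yaml.safe_load(example_yaml)[
    "host_type_specific_metadata"]["base"]["metadata_fields"]
# _make_cerberus_schema(example_fields) ->
# {"sample_name": {"type": "string"}, "sample_type": {"empty": False}}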