metameq 2026.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metameq/__init__.py +42 -0
- metameq/_version.py +21 -0
- metameq/config/__init__.py +0 -0
- metameq/config/config.yml +3 -0
- metameq/config/standards.yml +1648 -0
- metameq/src/__init__.py +0 -0
- metameq/src/__main__.py +34 -0
- metameq/src/metadata_configurator.py +512 -0
- metameq/src/metadata_extender.py +1168 -0
- metameq/src/metadata_merger.py +362 -0
- metameq/src/metadata_transformers.py +335 -0
- metameq/src/metadata_validator.py +387 -0
- metameq/src/util.py +299 -0
- metameq/tests/__init__.py +0 -0
- metameq/tests/data/invalid.yml +1 -0
- metameq/tests/data/test_config.yml +9 -0
- metameq/tests/test_metadata_configurator.py +2334 -0
- metameq/tests/test_metadata_extender.py +2610 -0
- metameq/tests/test_metadata_merger.py +657 -0
- metameq/tests/test_metadata_transformers.py +277 -0
- metameq/tests/test_metadata_validator.py +1191 -0
- metameq/tests/test_util.py +436 -0
- metameq-2026.1.1.dist-info/METADATA +21 -0
- metameq-2026.1.1.dist-info/RECORD +27 -0
- metameq-2026.1.1.dist-info/WHEEL +5 -0
- metameq-2026.1.1.dist-info/entry_points.txt +2 -0
- metameq-2026.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,657 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas
|
|
3
|
+
from pandas.testing import assert_frame_equal
|
|
4
|
+
from unittest import TestCase
|
|
5
|
+
from metameq.src.metadata_merger import _check_for_nans, \
|
|
6
|
+
_check_for_duplicate_field_vals, _validate_merge, \
|
|
7
|
+
merge_many_to_one_metadata, merge_one_to_one_metadata, \
|
|
8
|
+
merge_sample_and_subject_metadata, find_common_col_names, \
|
|
9
|
+
find_common_df_cols
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestMetadataMerger(TestCase):
|
|
13
|
+
"""Test suite for metadata merging functions in metameq.src.metadata_merger module."""
|
|
14
|
+
|
|
15
|
+
# Tests for _check_for_nans
|
|
16
|
+
def test__check_for_nans_wo_nans(self):
    """Inspect a NaN-free column of a frame whose other column has a NaN.

    Column "a" holds no NaN while column "b" does; checking "a" is expected
    to produce an empty list of messages.
    """
    frame = pandas.DataFrame({"a": [1, 2, 3], "b": [4, np.nan, 6]})

    messages = _check_for_nans(frame, "test", "a")

    self.assertEqual([], messages)
|
|
30
|
+
|
|
31
|
+
def test__check_for_nans_w_nans(self):
    """Inspect a column that contains a NaN.

    Both columns hold a NaN here; only "b" is checked, and the result is
    expected to be a one-element list naming the set ("test") and the
    checked column ("b").
    """
    frame = pandas.DataFrame({"a": [1, np.nan, 3], "b": [4, np.nan, 6]})
    expected = ["'test' metadata has NaNs in column 'b'"]

    messages = _check_for_nans(frame, "test", "b")

    self.assertEqual(expected, messages)
|
|
45
|
+
|
|
46
|
+
def test__check_for_nans_with_empty(self):
    """Checking a column of a DataFrame with no columns must raise."""
    no_columns = pandas.DataFrame()

    # The concrete exception type is not pinned by this test; any Exception
    # subclass satisfies it.
    self.assertRaises(Exception, _check_for_nans, no_columns, "test", "a")
|
|
51
|
+
|
|
52
|
+
# Tests for _check_for_duplicate_field_vals
|
|
53
|
+
def test__check_for_duplicate_field_vals(self):
    """Inspect a duplicate-free column of a frame with duplicates elsewhere.

    Column "a" has unique values while column "b" repeats 5; checking "a"
    is expected to produce an empty list of messages.
    """
    frame = pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 5]})

    messages = _check_for_duplicate_field_vals(frame, "test", "a")

    self.assertEqual([], messages)
|
|
67
|
+
|
|
68
|
+
def test__check_for_duplicate_field_vals_w_duplicates(self):
    """Inspect a column in which two distinct values are repeated.

    Column "a" repeats both 1 and 2; the result is expected to be a single
    message that lists those repeated values.
    """
    frame = pandas.DataFrame({
        "a": [1, 2, 2, 3, 1],
        "b": [4, 5, 6, 6, 4],
    })
    expected = [
        "'test' metadata has duplicates of the following values "
        "in column 'a': [1 2]"
    ]

    messages = _check_for_duplicate_field_vals(frame, "test", "a")

    self.assertEqual(expected, messages)
|
|
84
|
+
|
|
85
|
+
def test_check_for_duplicate_field_vals_with_empty(self):
    """A DataFrame with no columns yields an empty list of messages."""
    no_columns = pandas.DataFrame()

    messages = _check_for_duplicate_field_vals(no_columns, "test", "col")

    self.assertEqual(messages, [])
|
|
90
|
+
|
|
91
|
+
# Tests for _validate_merge
|
|
92
|
+
def test__validate_merge(self):
    """Validate a merge whose inputs are expected to be accepted.

    The merge columns ("a" on the left, "c" on the right) exist in their
    frames and contain neither NaNs nor repeated values, so the call is
    expected to return without raising.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [7, 8, 9],
        "d": [10, 11, 12],
    })

    # Reaching the line after this call is the actual check: an exception
    # from the validator would fail the test before the assertion below.
    _validate_merge(lhs, rhs, "a", "c")
    self.assertTrue(True)
|
|
111
|
+
|
|
112
|
+
def test__validate_merge_err_left_col(self):
    """Validate a merge whose left merge column does not exist.

    Column "c" exists only in the right frame, so requesting it as the left
    merge column is expected to raise a ValueError that names the missing
    column.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [7, 8, 9],
        "d": [10, 11, 12],
    })
    pattern = r"left metadata missing merge column: \['c'\]"

    self.assertRaisesRegex(
        ValueError, pattern, _validate_merge, lhs, rhs, "c", "c")
|
|
133
|
+
|
|
134
|
+
def test__validate_merge_err_right_col(self):
    """Validate a merge whose right merge column does not exist.

    Column "a" exists only in the left frame, so requesting it as the right
    merge column is expected to raise a ValueError that names the missing
    column.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [7, 8, 9],
        "d": [10, 11, 12],
    })
    pattern = r"right metadata missing merge column: \['a'\]"

    self.assertRaisesRegex(
        ValueError, pattern, _validate_merge, lhs, rhs, "a", "a")
|
|
155
|
+
|
|
156
|
+
def test__validate_merge_err_msgs(self):
    """Validate a merge that has several problems at once.

    Each merge column contains both a NaN and a repeated value, so the
    raised ValueError is expected to carry one line per problem: the NaN
    messages for left and right first, then the duplicate messages.
    """
    lhs = pandas.DataFrame({
        "id": ['x', np.nan, 'x'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": [np.nan, 'y', 'y'],
        "c": [7, 8, 9],
        "d": [10, 11, 12],
    })

    # The continuation lines sit at column 0 on purpose: the triple-quoted
    # raw string is used verbatim as the regex, so leading whitespace would
    # become part of the pattern.
    expected_pattern = r"""Errors in metadata to merge:
'left' metadata has NaNs in column 'id'
'right' metadata has NaNs in column 'name'
'left' metadata has duplicates of the following values in column 'id': \['x'\]
'right' metadata has duplicates of the following values in column 'name': \['y'\]"""  # noqa: E501

    with self.assertRaisesRegex(ValueError, expected_pattern):
        _validate_merge(lhs, rhs, "id", "name")
|
|
183
|
+
|
|
184
|
+
# I'm not going to test every variation of the merge_one_to_one_metadata
|
|
185
|
+
# join (left, right, inner, outer, etc.) because the pandas library is
|
|
186
|
+
# already well-tested. I'm just going to test one to show that the
|
|
187
|
+
# function's calling the pandas merge function with the correct parameters.
|
|
188
|
+
|
|
189
|
+
# Tests for merge_one_to_one_metadata
|
|
190
|
+
def test_merge_one_to_one_metadata_left(self):
    """Merge one-to-one without passing an explicit join type.

    The right frame has an extra key ('q') with no partner on the left; the
    expected result keeps only the three left rows, each extended with the
    matching right-hand columns.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'z', 'q'],
        "c": [7, 8, 9, 90],
        "d": [10, 11, 12, 120],
    })
    expected = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
        "name": ['x', 'y', 'z'],
        "c": [7, 8, 9],
        "d": [10, 11, 12],
    })

    merged = merge_one_to_one_metadata(lhs, rhs, "id", "name")

    assert_frame_equal(merged, expected)
|
|
221
|
+
|
|
222
|
+
def test_merge_one_to_one_metadata_err(self):
    """Merge one-to-one with a NaN in the right merge column.

    Only one validation failure is exercised here, to show that validation
    errors surface from the merge call and that the custom set names
    ("first"/"second") appear in the message.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', np.nan, 'z'],
        "c": [7, 8, 9],
        "d": [10, 11, 12],
    })
    pattern = (r"Errors in metadata to merge:\n"
               r"'second' metadata has NaNs in column 'name'")

    with self.assertRaisesRegex(ValueError, pattern):
        merge_one_to_one_metadata(
            lhs, rhs, "id", "name",
            set_name_left="first", set_name_right="second")
|
|
246
|
+
|
|
247
|
+
# Tests for merge_many_to_one_metadata
|
|
248
|
+
def test_merge_many_to_one_metadata(self):
    """Merge many-to-one where two left rows share one right row.

    Left rows 101 and 104 both carry name 'x'; each is expected to receive
    the 'x' values of columns "c" and "d" from the right frame.
    """
    many = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
    })
    one = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [9, 10, 11],
        "d": [12, 13, 14],
    })
    expected = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
        "c": [9, 10, 11, 9],
        "d": [12, 13, 14, 12],
    })

    merged = merge_many_to_one_metadata(many, one, "name", "name")

    assert_frame_equal(merged, expected)
|
|
279
|
+
|
|
280
|
+
def test_merge_many_to_one_metadata_err(self):
    """Merge many-to-one with a NaN in the "one" side's merge column.

    Only one validation failure is exercised, to show that validation
    errors surface from the merge call and that the custom name given to
    the "one" set ("uno") appears in the message.
    """
    many = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
    })
    one = pandas.DataFrame({
        "name": ['x', 'y', np.nan],
        "c": [9, 10, 11],
        "d": [12, 13, 14],
    })
    pattern = (r"Errors in metadata to merge:\n"
               r"'uno' metadata has NaNs in column 'name'")

    with self.assertRaisesRegex(ValueError, pattern):
        merge_many_to_one_metadata(
            many, one, "name",
            set_name_many="lots", set_name_one="uno")
|
|
305
|
+
|
|
306
|
+
# Tests for merge_sample_and_subject_metadata
|
|
307
|
+
def test_merge_sample_and_subject_metadata(self):
    """Merge sample rows with their subject rows on a shared column.

    Samples 101 and 104 both belong to subject 'x'; each is expected to
    receive that subject's "c" and "d" values.
    """
    samples = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
    })
    subjects = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [9, 10, 11],
        "d": [12, 13, 14],
    })
    expected = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
        "c": [9, 10, 11, 9],
        "d": [12, 13, 14, 12],
    })

    merged = merge_sample_and_subject_metadata(samples, subjects, "name")

    assert_frame_equal(merged, expected)
|
|
337
|
+
|
|
338
|
+
def test_merge_sample_and_subject_metadata_err(self):
    """Merge sample and subject metadata with a NaN subject key.

    The subject frame has a NaN in the merge column; the raised ValueError
    is expected to refer to that frame as 'subject'.
    """
    samples = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
    })
    subjects = pandas.DataFrame({
        "name": ['x', 'y', np.nan],
        "c": [9, 10, 11],
        "d": [12, 13, 14],
    })
    pattern = (r"Errors in metadata to merge:\n"
               r"'subject' metadata has NaNs in column 'name'")

    with self.assertRaisesRegex(ValueError, pattern):
        merge_sample_and_subject_metadata(samples, subjects, "name")
|
|
362
|
+
|
|
363
|
+
# Tests for find_common_col_names
|
|
364
|
+
def test_find_common_col_names(self):
    """Two name lists that overlap in 'col2' and 'col3'."""
    first_names = ['col1', 'col2', 'col3']
    second_names = ['col2', 'col3', 'col4']

    shared = find_common_col_names(first_names, second_names)

    self.assertEqual(shared, ['col2', 'col3'])
|
|
370
|
+
|
|
371
|
+
def test_find_common_col_names_empty(self):
    """Two empty name lists share nothing."""
    shared = find_common_col_names([], [])

    self.assertEqual(shared, [])
|
|
375
|
+
|
|
376
|
+
def test_find_common_col_names_no_common(self):
    """Two non-empty, disjoint name lists share nothing."""
    first_names = ['col1', 'col2']
    second_names = ['col3', 'col4']

    shared = find_common_col_names(first_names, second_names)

    self.assertEqual(shared, [])
|
|
382
|
+
|
|
383
|
+
def test_find_common_col_names_case_sensitive(self):
    """Names that differ only in letter case are not treated as common."""
    first_names = ['Col1', 'col2']
    second_names = ['col1', 'Col2']

    shared = find_common_col_names(first_names, second_names)

    self.assertEqual(shared, [])
|
|
389
|
+
|
|
390
|
+
# Tests for find_common_df_cols
|
|
391
|
+
def test_find_common_df_cols(self):
    """Two DataFrames whose columns overlap in 'col2' and 'col3'."""
    first = pandas.DataFrame({
        'col1': [1, 2],
        'col2': [3, 4],
        'col3': [5, 6],
    })
    second = pandas.DataFrame({
        'col2': [7, 8],
        'col3': [9, 10],
        'col4': [11, 12],
    })

    shared = find_common_df_cols(first, second)

    self.assertEqual(shared, ['col2', 'col3'])
|
|
406
|
+
|
|
407
|
+
def test_find_common_df_cols_empty(self):
    """Two DataFrames with no columns share no columns."""
    first = pandas.DataFrame()
    second = pandas.DataFrame()

    shared = find_common_df_cols(first, second)

    self.assertEqual(shared, [])
|
|
413
|
+
|
|
414
|
+
def test_find_common_df_cols_no_common(self):
    """Two DataFrames with disjoint column names share no columns."""
    first = pandas.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    second = pandas.DataFrame({'col3': [5, 6], 'col4': [7, 8]})

    shared = find_common_df_cols(first, second)

    self.assertEqual(shared, [])
|
|
427
|
+
|
|
428
|
+
# Tests for merge_one_to_one_metadata
|
|
429
|
+
def test_merge_one_to_one_metadata_right(self):
    """Merge one-to-one with join_type='right'.

    The right frame has a key ('q') with no left partner; the expected
    result keeps all four right rows and fills the left-hand columns of the
    unmatched row with NaN.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'z', 'q'],
        "c": [7, 8, 9, 90],
        "d": [10, 11, 12, 120],
    })
    expected = pandas.DataFrame({
        "id": ['x', 'y', 'z', np.nan],
        "a": [1, 2, 3, np.nan],
        "b": [4, 5, 6, np.nan],
        "name": ['x', 'y', 'z', 'q'],
        "c": [7, 8, 9, 90],
        "d": [10, 11, 12, 120],
    })

    merged = merge_one_to_one_metadata(
        lhs, rhs, "id", "name", join_type='right')

    assert_frame_equal(merged, expected)
|
|
459
|
+
|
|
460
|
+
def test_merge_one_to_one_metadata_inner(self):
    """Merge one-to-one with join_type='inner'.

    Left key 'z' and right key 'q' have no partner; the expected result
    contains only the rows for the keys present on both sides ('x', 'y').
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'q'],
        "c": [7, 8, 90],
        "d": [10, 11, 120],
    })
    expected = pandas.DataFrame({
        "id": ['x', 'y'],
        "a": [1, 2],
        "b": [4, 5],
        "name": ['x', 'y'],
        "c": [7, 8],
        "d": [10, 11],
    })

    merged = merge_one_to_one_metadata(
        lhs, rhs, "id", "name", join_type='inner')

    assert_frame_equal(merged, expected)
|
|
489
|
+
|
|
490
|
+
def test_merge_one_to_one_metadata_with_nans(self):
    """Merge one-to-one when the right merge column contains a NaN.

    Both frames use "id" as the merge column and only the left column name
    is passed to the call. The right frame's "id" holds a NaN, so the merge
    is expected to raise a ValueError reporting NaNs in column 'id'.
    """
    left_df = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6]
    })
    right_df = pandas.DataFrame({
        "id": ['x', 'y', np.nan],
        "c": [7, 8, 90],
        "d": [10, 11, 120]
    })

    # Match on the NaN message rather than on any ValueError, so that an
    # unrelated failure (e.g. a missing merge column) cannot satisfy the
    # test. The set-name prefix is left out of the pattern because this
    # test relies on the function's default set names.
    with self.assertRaisesRegex(
            ValueError, r"metadata has NaNs in column 'id'"):
        merge_one_to_one_metadata(left_df, right_df, 'id')
|
|
509
|
+
|
|
510
|
+
def test_merge_one_to_one_metadata_with_duplicates(self):
    """Merge one-to-one when the right merge column repeats a value.

    The right frame's "name" column contains 'y' twice, which violates the
    one-to-one requirement; the merge is expected to raise a ValueError
    reporting the duplicated value.
    """
    left_df = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6]
    })
    right_df = pandas.DataFrame({
        "name": ['x', 'y', 'z', 'y'],
        "c": [7, 8, 9, 90],
        "d": [10, 11, 12, 120]
    })

    # Match on the duplicates message rather than on any ValueError, so
    # that an unrelated failure cannot satisfy the test. The set-name
    # prefix is left out of the pattern because this test relies on the
    # function's default set names.
    with self.assertRaisesRegex(
            ValueError,
            r"metadata has duplicates of the following values in "
            r"column 'name': \['y'\]"):
        merge_one_to_one_metadata(left_df, right_df, 'id', 'name')
|
|
529
|
+
|
|
530
|
+
# Tests for merge_many_to_one_metadata
|
|
531
|
+
def test_merge_many_to_one_metadata_with_duplicates(self):
    """Merge many-to-one when the "many" merge column repeats a value.

    'x' appears twice in the left ("many") frame's "name" column; both of
    those rows are expected to receive the single 'x' row of the right
    ("one") frame. The fixture is the same as the one used by
    test_merge_many_to_one_metadata.
    """
    many = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
    })
    one = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [9, 10, 11],
        "d": [12, 13, 14],
    })
    expected = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'x'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
        "c": [9, 10, 11, 9],
        "d": [12, 13, 14, 12],
    })

    merged = merge_many_to_one_metadata(many, one, "name", "name")

    assert_frame_equal(merged, expected)
|
|
562
|
+
|
|
563
|
+
# Tests for merge_sample_and_subject_metadata
|
|
564
|
+
def test_merge_sample_and_subject_metadata_with_missing(self):
    """Merge samples with subjects when one sample has no subject row.

    Sample 104 names subject 'w', which is absent from the subject frame;
    that sample is expected to be kept, with NaN in the subject-derived
    columns "c" and "d".
    """
    samples = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'w'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
    })
    subjects = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [9, 10, 11],
        "d": [12, 13, 14],
    })
    expected = pandas.DataFrame({
        "id": [101, 102, 103, 104],
        "name": ['x', 'y', 'z', 'w'],
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
        "c": [9, 10, 11, np.nan],
        "d": [12, 13, 14, np.nan],
    })

    merged = merge_sample_and_subject_metadata(samples, subjects, "name")

    assert_frame_equal(merged, expected)
|
|
595
|
+
|
|
596
|
+
def test_validate_merge(self):
    """Validate a merge whose only NaN sits in a non-merge column.

    The merge columns ("id" and "name") are clean; the NaN in right-hand
    column "d" is not part of the merge, so the call is expected to return
    without raising.
    """
    lhs = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    })
    rhs = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [7, 8, 9],
        "d": [10, np.nan, 12],  # not a merge column, so its NaN is fine
    })

    # No assertion is needed: an exception here would fail the test.
    _validate_merge(lhs, rhs, 'id', 'name')
|
|
617
|
+
|
|
618
|
+
def test_validate_merge_non_existent_col(self):
    """Validate a merge whose left merge column is in neither frame.

    'subject_id' is not a column of the left frame, so validation is
    expected to raise a ValueError that names it as the missing left merge
    column.
    """
    left_df = pandas.DataFrame({
        "id": ['x', 'y', 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6]
    })
    right_df = pandas.DataFrame({
        "name": ['x', 'y', 'z'],
        "c": [7, 8, 9],
        # This NaN is in a non-merge column and plays no role in the
        # expected failure.
        "d": [10, np.nan, 12]
    })

    # Match on the missing-column message rather than on any ValueError, so
    # that a different validation failure cannot satisfy the test.
    with self.assertRaisesRegex(
            ValueError,
            r"left metadata missing merge column: \['subject_id'\]"):
        _validate_merge(left_df, right_df, 'subject_id', 'name')
|
|
638
|
+
|
|
639
|
+
def test_validate_merge_with_nans(self):
    """Validate a merge in which both merge columns contain a NaN.

    The left "id" column and the right "name" column each hold one NaN;
    the raised ValueError is expected to report both of them under the
    common "Errors in metadata to merge" header.
    """
    left_df = pandas.DataFrame({
        "id": ['x', np.nan, 'z'],
        "a": [1, 2, 3],
        "b": [4, 5, 6]
    })
    right_df = pandas.DataFrame({
        "name": ['x', 'y', np.nan],
        "c": [7, 8, 9],
        "d": [10, 11, 12]
    })

    # Require both NaN lines, not just the header, so the test fails if
    # either frame's NaN goes unreported. The line order (left, then right)
    # follows the message layout asserted in test__validate_merge_err_msgs.
    exp_msg = (r"Errors in metadata to merge:\n"
               r"'left' metadata has NaNs in column 'id'\n"
               r"'right' metadata has NaNs in column 'name'")

    with self.assertRaisesRegex(ValueError, exp_msg):
        _validate_merge(left_df, right_df, "id", "name")
|