napistu-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/sbml_dfs_utils.py ADDED
@@ -0,0 +1,304 @@
+ from __future__ import annotations
+
+ import copy
+
+ import logging
+ import re
+ from typing import Any
+ from typing import Iterable
+ from fs import open_fs
+
+ import numpy as np
+ import pandas as pd
+ from napistu import utils
+ from napistu import indices
+
+ from napistu.constants import SBML_DFS
+ from napistu.constants import IDENTIFIERS
+ from napistu.constants import BQB_DEFINING_ATTRS
+ from napistu.constants import BQB_DEFINING_ATTRS_LOOSE
+
+ logger = logging.getLogger(__name__)
+
+
+ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
+     """
+     Unnest Identifiers
+
+     Take a pd.DataFrame containing an array of Identifiers and
+     return one row per identifier.
+
+     Parameters:
+     id_table: pd.DataFrame
+         a table containing an array of Identifiers
+     id_var: str
+         variable containing Identifiers
+
+     Returns:
+         pd.DataFrame containing the index of id_table but expanded
+         to include one row per identifier
+
+     """
+
+     # validate inputs
+     utils.match_pd_vars(id_table, {id_var}).assert_present()
+
+     N_invalid_ids = sum(id_table[id_var].isna())
+     if N_invalid_ids != 0:
+         raise ValueError(
+             f'{N_invalid_ids} entries in "id_table" were missing; '
+             "entries with no identifiers should still include an Identifiers object"
+         )
+
+     # Get the identifiers as a list of dicts
+     df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
+     # Filter out zero-length lists
+     df = df.query(f"{id_var} != 0")
+     # Unnest the list of dicts into one dict per row
+     df = df.explode(id_var)
+     # Unnest each dict into a dataframe
+     df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
+     # Add the entry number as an index
+     df["entry"] = df.groupby(df.index).cumcount()
+     df.set_index("entry", append=True, inplace=True)
+     return df
+
+
+ def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
+     """Format integer id_values as fixed-width internal IDs using the prefix implied by id_type (e.g., "s_id" -> "S")."""
+     id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
+     return [id_prefix + format(x, f"0{id_len}d") for x in id_values]
+
+
+ def id_formatter_inv(ids: list[str]) -> list[int]:
+     """
+     ID Formatter Inverter
+
+     Convert from internal IDs back to integer IDs
+     """
+
+     id_val = list()
+     for an_id in ids:
+         if re.match("^[A-Z]+[0-9]+$", an_id):
+             id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
+         else:
+             id_val.append(np.nan)  # type: ignore
+
+     return id_val
+
+
+ def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
+     """
+     Get Current Max ID
+
+     Look at a table from an SBML_dfs object and find the largest primary key following
+     the default naming convention for the table.
+
+     Params:
+     sbml_dfs_table (pd.DataFrame):
+         A table derived from an SBML_dfs object.
+
+     Returns:
+     current_max_id (int):
+         The largest id which is already defined in the table using its expected naming
+         convention. If no IDs following this convention are present then the default
+         will be -1. In this way new IDs will be added starting with 0.
+
+     """
+
+     existing_ids_numeric = id_formatter_inv(sbml_dfs_table.index.tolist())
+
+     # filter np.nan which will be introduced if the key is not the default format
+     existing_ids_numeric_valid = [x for x in existing_ids_numeric if x is not np.nan]
+     if len(existing_ids_numeric_valid) == 0:
+         current_max_id = -1
+     else:
+         current_max_id = max(existing_ids_numeric_valid)
+
+     return current_max_id
+
+
+ def adapt_pw_index(
+     source: str | indices.PWIndex,
+     species: str | Iterable[str] | None,
+     outdir: str | None = None,
+ ) -> indices.PWIndex:
+     """Adapts a pw_index
+
+     Helpful to filter for species before reconstructing.
+
+     Args:
+         source (str | PWIndex): uri for a pw_index.csv file or a PWIndex object
+         species (str | Iterable[str] | None): species name(s) to filter the index to
+         outdir (str | None, optional): Optional directory to write pw_index to.
+             Defaults to None.
+
+     Returns:
+         indices.PWIndex: Filtered pw index
+     """
+     if isinstance(source, str):
+         pw_index = indices.PWIndex(source)
+     elif isinstance(source, indices.PWIndex):
+         pw_index = copy.deepcopy(source)
+     else:
+         raise ValueError("'source' needs to be str or PWIndex.")
+     pw_index.filter(species=species)
+
+     if outdir is not None:
+         with open_fs(outdir, create=True) as fs:
+             with fs.open("pw_index.tsv", "w") as f:
+                 pw_index.index.to_csv(f, sep="\t")
+     return pw_index
+
+
+ def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
+     """Select the BQB terms which define distinct species, depending on whether dogmatic mode is enabled."""
+     assert isinstance(dogmatic, bool)
+     if dogmatic:
+         logger.info(
+             "Running in dogmatic mode - differences between genes, transcripts, and proteins "
+             "will be maintained as separate species where possible."
+         )
+         # preserve differences between genes, transcripts, and proteins
+         defining_biological_qualifiers = BQB_DEFINING_ATTRS
+     else:
+         logger.info(
+             "Running in non-dogmatic mode - genes, transcripts, and proteins will "
+             "be merged if possible."
+         )
+         # merge genes, transcripts, and proteins (if they are defined with
+         # bqb terms which specify their relationships).
+         defining_biological_qualifiers = BQB_DEFINING_ATTRS_LOOSE
+
+     return defining_biological_qualifiers
+
+
+ def match_entitydata_index_to_entity(
+     entity_data_dict: dict,
+     an_entity_data_type: str,
+     consensus_entity_df: pd.DataFrame,
+     entity_schema: dict,
+     table: str,
+ ) -> pd.DataFrame:
+     """
+     Match the index of entity_data_dict[an_entity_data_type] with the index of the corresponding entity.
+     Update entity_data_dict[an_entity_data_type]'s index to match consensus_entity_df's index and
+     report cases where entity_data has indices not present in the corresponding entity's index.
+     Args:
+         entity_data_dict (dict): dictionary containing all models' "an_entity_data_type" dictionaries
+         an_entity_data_type (str): data_type from species/reactions_data in entity_data_dict
+         consensus_entity_df (pd.DataFrame): the dataframe of the corresponding entity
+         entity_schema (dict): schema for "table"
+         table (str): table whose data is being consolidated (currently species or reactions)
+     Returns:
+         entity_data_df (pd.DataFrame): table for entity_data_dict[an_entity_data_type]
+     """
+
+     data_table = table + "_data"
+     entity_data_df = entity_data_dict[an_entity_data_type]
+
+     # ensure entity_data_df[an_entity_data_type]'s index doesn't have
+     # ids that are not in consensus_entity's index
+     if len(entity_data_df.index.difference(consensus_entity_df.index)) == 0:
+         logger.info(f"{data_table} ids are included in {table} ids")
+     else:
+         logger.warning(
+             f"{data_table} contains ids which are not matched to {table} ids; "
+             f"please check the mismatched ids first"
+         )
+
+     # when entity_data_df is only a subset of the index of consensus_entity_df,
+     # add ids only present in consensus_entity_df to entity_data_df and fill values with NaN
+     if len(entity_data_df) != len(consensus_entity_df):
+         logger.info(
+             f"The {data_table} has {len(entity_data_df)} ids, "
+             f"different from the {len(consensus_entity_df)} ids in the {table} table; "
+             f"updating {data_table} ids."
+         )
+
+         entity_data_df = pd.concat(
+             [
+                 entity_data_df,
+                 consensus_entity_df[
+                     ~consensus_entity_df.index.isin(entity_data_df.index)
+                 ],
+             ],
+             ignore_index=False,
+         )
+
+         entity_data_df.drop(entity_schema["vars"], axis=1, inplace=True)
+
+     return entity_data_df
+
+
+ def check_entity_data_index_matching(sbml_dfs, table):
+     """
+     Update the index of the input sbml_dfs's entity_data (dict)
+     with match_entitydata_index_to_entity,
+     so that the index of each dataframe in entity_data (dict) matches the sbml_dfs'
+     corresponding entity, and then passes sbml_dfs.validate()
+     Args:
+         sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
+         table (str): table whose data is being consolidated (currently species or reactions)
+     Returns:
+         sbml_dfs (cpr.SBML_dfs):
+             sbml_dfs whose entity_data is checked to have the same index
+             as the corresponding entity.
+     """
+
+     table_data = table + "_data"
+
+     entity_data_dict = getattr(sbml_dfs, table_data)
+     entity_schema = sbml_dfs.schema[table]
+     sbml_dfs_entity = getattr(sbml_dfs, table)
+
+     if entity_data_dict != {}:
+         entity_data_types = set(entity_data_dict.keys())
+
+         entity_data_dict_checked = {
+             x: match_entitydata_index_to_entity(
+                 entity_data_dict, x, sbml_dfs_entity, entity_schema, table
+             )
+             for x in entity_data_types
+         }
+
+         if table == SBML_DFS.REACTIONS:
+             sbml_dfs.reactions_data = entity_data_dict_checked
+         elif table == SBML_DFS.SPECIES:
+             sbml_dfs.species_data = entity_data_dict_checked
+
+     return sbml_dfs
+
+
+ def _stub_ids(ids):
+     """Stub with a single blank ID if the ids list is empty; otherwise create a DataFrame from the provided ids."""
+     if len(ids) == 0:
+         return pd.DataFrame(
+             {
+                 IDENTIFIERS.ONTOLOGY: [None],
+                 IDENTIFIERS.IDENTIFIER: [None],
+                 IDENTIFIERS.URL: [None],
+                 IDENTIFIERS.BQB: [None],
+             }
+         )
+     else:
+         return pd.DataFrame(ids)
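
The ID helpers above are small enough to demonstrate end to end. The sketch below is illustrative only and is not part of the wheel; it assumes the module is importable as napistu.sbml_dfs_utils and that utils.extract_regex_match returns the captured prefix group, as its use in id_formatter implies.

    import pandas as pd

    from napistu import sbml_dfs_utils

    # "s_id" yields the prefix "S"; integers are zero-padded to 8 digits
    ids = sbml_dfs_utils.id_formatter([0, 1, 2], "s_id")
    # expected: ['S00000000', 'S00000001', 'S00000002']

    # recover the integer component; IDs that don't match the convention become NaN
    print(sbml_dfs_utils.id_formatter_inv(ids))  # expected: [0, 1, 2]

    # the largest conforming ID in a keyed table; -1 if none conform
    keyed_table = pd.DataFrame({"name": ["a", "b", "c"]}, index=ids)
    print(sbml_dfs_utils.get_current_max_id(keyed_table))  # expected: 2
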
napistu/source.py ADDED
@@ -0,0 +1,394 @@
+ from __future__ import annotations
+
+ import pandas as pd
+ from napistu import indices
+ from napistu.constants import SOURCE_SPEC
+
+
+ class Source:
+     """
+     An Entity's Source
+
+     Attributes
+     ----------
+     source : pd.DataFrame
+         A dataframe containing the model source and other optional variables
+
+     Methods
+     -------
+
+     """
+
+     def __init__(
+         self,
+         source_df: pd.DataFrame | None = None,
+         init: bool = False,
+         pw_index: indices.PWIndex | None = None,
+     ) -> None:
+         """
+         Tracks the model(s) an entity (i.e., a compartment, species, reaction) came from.
+
+         By convention sources exist only for the models that an entity came from rather
+         than the current model they are part of. For example, when combining Reactome models
+         into a consensus, a molecule which existed in multiple models would have a source entry
+         for each, but it would not have a source entry for the consensus model itself.
+
+         Parameters
+         ----------
+         source_df : pd.DataFrame
+             A dataframe containing the model source and other optional variables
+         init : bool
+             Creates an empty source object. This is typically used when creating an SBML_dfs
+             object from a single source.
+         pw_index : indices.PWIndex
+             A pathway index used to add additional metadata to source_df
+
+         Returns
+         -------
+         None.
+
+         """
+
+         if init is True:
+             # initialize with an empty Source
+             self.source = None
+         else:
+             if isinstance(source_df, pd.DataFrame):
+                 # if pw_index is provided then it will be joined to source_df to add additional metadata
+                 if pw_index is not None:
+                     if not isinstance(pw_index, indices.PWIndex):
+                         raise ValueError(
+                             f"pw_index must be an indices.PWIndex or None and was {type(pw_index).__name__}"
+                         )
+                     else:
+                         # check that all models are present in the pathway index
+                         missing_pathways = set(
+                             source_df[SOURCE_SPEC.MODEL].tolist()
+                         ).difference(
+                             set(pw_index.index[SOURCE_SPEC.PATHWAY_ID].tolist())
+                         )
+                         if len(missing_pathways) > 0:
+                             raise ValueError(
+                                 f"{len(missing_pathways)} pathway models are present"
+                                 f" in source_df but not the pw_index: {', '.join(missing_pathways)}"
+                             )
+
+                         source_df = source_df.merge(
+                             pw_index.index,
+                             left_on=SOURCE_SPEC.MODEL,
+                             right_on=SOURCE_SPEC.PATHWAY_ID,
+                         )
+
+                 self.source = source_df
+             else:
+                 raise TypeError(
+                     'source_df must be a pd.DataFrame if "init" is False, but was type '
+                     f"{type(source_df).__name__}"
+                 )
+
+             if SOURCE_SPEC.MODEL not in source_df.columns.values.tolist():
+                 raise ValueError(
+                     f"{SOURCE_SPEC.MODEL} variable was not found, but is required in a Source object"
+                 )
+             if SOURCE_SPEC.PATHWAY_ID not in source_df.columns.values.tolist():
+                 raise ValueError(
+                     f"{SOURCE_SPEC.PATHWAY_ID} variable was not found, but is required in a Source object"
+                 )
+
+
+ def create_source_table(
+     lookup_table: pd.Series, table_schema: dict, pw_index: indices.PWIndex | None
+ ) -> pd.DataFrame:
+     """
+     Create Source Table
+
+     Create a table with one row per "new_id" and a Source object created from the union
+     of "old_id" Source objects
+     """
+
+     if SOURCE_SPEC.SOURCE not in table_schema.keys():
+         raise ValueError(
+             f"{SOURCE_SPEC.SOURCE} not present in schema, can't create source_table"
+         )
+
+     # take lookup_table and create an index on "new_id". Multiple rows may have the
+     # same value for new_id so these are grouped together.
+     lookup_table_rearranged = lookup_table.reset_index().set_index(["new_id"])
+
+     # create a Source object for each value of new_id based on the dataframe
+     # specific to that new_id; pw_index is provided to fill out additional
+     # meta-information beyond the pathway_id which defines a single source
+     def create_source(group):
+         return Source(
+             group.reset_index(drop=True),
+             pw_index=pw_index,
+         )
+
+     id_table = (
+         lookup_table_rearranged.groupby("new_id")
+         .apply(create_source)
+         .rename(table_schema[SOURCE_SPEC.SOURCE])
+         .to_frame()
+     )
+
+     id_table.index = id_table.index.rename(table_schema["pk"])
+
+     return id_table
+
+
+ def merge_sources(source_list: list | pd.Series) -> Source:
+     """
+     Merge Sources
+
+     Merge a list of Source objects into a single Source object
+
+     """
+
+     # filter to non-empty sources
+     # empty sources have only been initialized; a merge hasn't occurred
+     existing_sources = [s.source is not None for s in source_list]
+     if not any(existing_sources):
+         if isinstance(source_list, list):
+             return source_list[0]
+         else:
+             return source_list.iloc[0]
+
+     existing_source_list = [
+         x.source for x, y in zip(source_list, existing_sources) if y
+     ]
+
+     return Source(pd.concat(existing_source_list))
+
+
+ def unnest_sources(
+     source_table: pd.DataFrame, source_var: str, verbose: bool = False
+ ) -> pd.DataFrame:
+     """
+     Unnest Sources
+
+     Take a pd.DataFrame containing an array of Sources and
+     return one row per source.
+
+     Parameters:
+     source_table: pd.DataFrame
+         a table containing an array of Sources
+     source_var: str
+         variable containing Sources
+
+     Returns:
+         pd.DataFrame containing the index of source_table but expanded
+         to include one row per source
+
+     """
+
+     sources = list()
+     source_table_index = source_table.index.to_frame().reset_index(drop=True)
+
+     for i in range(source_table.shape[0]):
+         if verbose:
+             print(f"Processing {source_table_index.index.values[i]}")
+
+         # check that the entries of source_var are Source objects
+         source_value = source_table[source_var].iloc[i]
+
+         assert isinstance(source_value, Source)
+
+         if source_value.source is None:
+             print("Some sources were empty (only initialized) - returning None")
+             return None
+
+         source_tbl = pd.DataFrame(source_value.source)
+         source_tbl.index.name = SOURCE_SPEC.INDEX_NAME
+         source_tbl = source_tbl.reset_index()
+
+         # add the original index as variables and then set the index
+         for j in range(source_table_index.shape[1]):
+             source_tbl[source_table_index.columns[j]] = source_table_index.iloc[i, j]
+         source_tbl = source_tbl.set_index(
+             list(source_table_index.columns) + [SOURCE_SPEC.INDEX_NAME]
+         )
+
+         sources.append(source_tbl)
+
+     return pd.concat(sources)
+
+
+ def greedy_set_coverge_of_sources(
+     source_df: pd.DataFrame, table_schema: dict
+ ) -> pd.DataFrame:
+     """
+     Greedy Set Coverage of Sources
+
+     Apply the greedy set cover heuristic to find a small set of
+     sources which covers all entries
+
+     Parameters:
+     source_df: pd.DataFrame
+         pd.DataFrame containing the index of source_table but expanded to
+         include one row per source, as produced by source.unnest_sources()
+     table_schema: dict
+         schema for the table that the sources belong to
+
+     Returns:
+     minimal_sources: pd.DataFrame
+         The rows of the deduplicated source table whose pathway_ids form the
+         covering set
+
+     """
+
+     # rollup pathways with identical membership
+     deduplicated_sources = _deduplicate_source_df(source_df, table_schema)
+
+     unaccounted_for_members = deduplicated_sources
+     retained_pathway_ids = []
+
+     while unaccounted_for_members.shape[0] != 0:
+         # find the pathway with the most members
+         pathway_members = unaccounted_for_members.groupby(SOURCE_SPEC.PATHWAY_ID).size()
+         top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+         retained_pathway_ids.append(top_pathway)
+
+         # remove all members associated with the top pathway
+         members_captured = (
+             unaccounted_for_members[
+                 unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
+             ]
+             .index.get_level_values(table_schema["pk"])
+             .tolist()
+         )
+
+         unaccounted_for_members = unaccounted_for_members[
+             ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
+                 members_captured
+             )
+         ]
+
+     minimal_sources = deduplicated_sources[
+         deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
+     ].sort_index()
+
+     return minimal_sources
+
+
+ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
+     """Combine entries in a source table when multiple models have the same members."""
+
+     # drop entries which are missing required attributes and throw an error if none are left
+     REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
+     indexed_sources = (
+         source_df.reset_index()
+         .merge(source_df[REQUIRED_NON_NA_ATTRIBUTES].dropna())
+         .set_index(SOURCE_SPEC.PATHWAY_ID)
+     )
+
+     if indexed_sources.shape[0] == 0:
+         raise ValueError(
+             f"source_df was provided but zero entries had a defined {' OR '.join(REQUIRED_NON_NA_ATTRIBUTES)}"
+         )
+
+     pathways = indexed_sources.index.unique()
+
+     # identify pathways with identical coverage
+     pathway_member_string = (
+         pd.DataFrame(
+             [
+                 {
+                     SOURCE_SPEC.PATHWAY_ID: p,
+                     "membership_string": "_".join(
+                         set(indexed_sources.loc[[p]][table_schema["pk"]].tolist())
+                     ),
+                 }
+                 for p in pathways
+             ]
+         )
+         .drop_duplicates()
+         .set_index("membership_string")
+     )
+
+     membership_categories = pathway_member_string.merge(
+         source_df.groupby(SOURCE_SPEC.PATHWAY_ID).first(),
+         left_on=SOURCE_SPEC.PATHWAY_ID,
+         right_index=True,
+     )
+
+     category_index = membership_categories.index.unique()
+     assert isinstance(category_index, pd.core.indexes.base.Index)
+
+     merged_sources = pd.concat(
+         [
+             _collapse_by_membership_string(s, membership_categories, table_schema)  # type: ignore
+             for s in category_index.tolist()
+         ]
+     )
+     merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
+         table_schema["pk"]
+     ).cumcount()
+
+     return merged_sources.set_index(
+         [table_schema["pk"], SOURCE_SPEC.INDEX_NAME]
+     ).sort_index()
+
+
+ def _collapse_by_membership_string(
+     membership_string: str, membership_categories: pd.DataFrame, table_schema: dict
+ ) -> pd.DataFrame:
+     """Assign each member of a membership-string to a set of pathways."""
+
+     collapsed_source_membership = _collapse_source_df(
+         membership_categories.loc[membership_string]
+     )
+
+     return pd.DataFrame(
+         [
+             pd.concat(
+                 [pd.Series({table_schema["pk"]: ms}), collapsed_source_membership]
+             )
+             for ms in membership_string.split("_")
+         ]
+     )
+
+
+ def _collapse_source_df(source_df: pd.DataFrame) -> pd.Series:
+     """Collapse a source_df table into a single entry."""
+
+     if isinstance(source_df, pd.DataFrame):
+         collapsed_source_series = pd.Series(
+             {
+                 SOURCE_SPEC.PATHWAY_ID: " OR ".join(source_df[SOURCE_SPEC.PATHWAY_ID]),
+                 SOURCE_SPEC.MODEL: " OR ".join(source_df[SOURCE_SPEC.MODEL]),
+                 SOURCE_SPEC.SOURCE: " OR ".join(
+                     set(source_df[SOURCE_SPEC.SOURCE].tolist())
+                 ),
+                 SOURCE_SPEC.SPECIES: " OR ".join(
+                     set(source_df[SOURCE_SPEC.SPECIES].tolist())
+                 ),
+                 SOURCE_SPEC.NAME: " OR ".join(source_df[SOURCE_SPEC.NAME]),
+                 SOURCE_SPEC.N_COLLAPSED_PATHWAYS: source_df.shape[0],
+             }
+         )
+     elif isinstance(source_df, pd.Series):
+         collapsed_source_series = pd.Series(
+             {
+                 SOURCE_SPEC.PATHWAY_ID: source_df[SOURCE_SPEC.PATHWAY_ID],
+                 SOURCE_SPEC.MODEL: source_df[SOURCE_SPEC.MODEL],
+                 SOURCE_SPEC.SOURCE: source_df[SOURCE_SPEC.SOURCE],
+                 SOURCE_SPEC.SPECIES: source_df[SOURCE_SPEC.SPECIES],
+                 SOURCE_SPEC.NAME: source_df[SOURCE_SPEC.NAME],
+                 SOURCE_SPEC.N_COLLAPSED_PATHWAYS: 1,
+             }
+         )
+     else:
+         raise TypeError(
+             f"source_df must be a pd.DataFrame or pd.Series, but was a {type(source_df).__name__}"
+         )
+
+     return collapsed_source_series
+
+
+ def _safe_source_merge(member_Sources: Source | pd.Series) -> Source:
+     """Combine either a Source or a pd.Series of Sources into a single Source object."""
+
+     if isinstance(member_Sources, Source):
+         return member_Sources
+     elif isinstance(member_Sources, pd.Series):
+         return merge_sources(member_Sources.tolist())
+     else:
+         raise TypeError("Expecting source.Source or pd.Series")
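
To make the Source machinery concrete, here is a minimal usage sketch (illustrative only, not part of the wheel). The model and pathway values are invented placeholders; the only assumptions are the SOURCE_SPEC.MODEL and SOURCE_SPEC.PATHWAY_ID columns that the constructor above requires, and no pw_index is supplied.

    import pandas as pd

    from napistu import source
    from napistu.constants import SOURCE_SPEC

    # one single-row source table per model; both required columns are present
    sources = [
        source.Source(
            pd.DataFrame({SOURCE_SPEC.MODEL: [model], SOURCE_SPEC.PATHWAY_ID: [model]})
        )
        for model in ["pathway_a", "pathway_b"]
    ]

    # merging stacks the underlying source tables row-wise into one Source
    merged = source.merge_sources(sources)
    print(merged.source[SOURCE_SPEC.PATHWAY_ID].tolist())  # expected: ['pathway_a', 'pathway_b']
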