napistu-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries; it is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/sbml_dfs_utils.py
ADDED
@@ -0,0 +1,304 @@
from __future__ import annotations

import copy

import logging
import re
from typing import Any
from typing import Iterable
from fs import open_fs

import numpy as np
import pandas as pd
from napistu import utils
from napistu import indices

from napistu.constants import SBML_DFS
from napistu.constants import IDENTIFIERS
from napistu.constants import BQB_DEFINING_ATTRS
from napistu.constants import BQB_DEFINING_ATTRS_LOOSE

logger = logging.getLogger(__name__)

def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
    """
    Unnest Identifiers

    Take a pd.DataFrame containing an array of Identifiers and
    return one row per identifier.

    Parameters:
    id_table: pd.DataFrame
        a table containing an array of Identifiers
    id_var: str
        variable containing Identifiers

    Returns:
        pd.DataFrame containing the index of id_table but expanded
        to include one row per identifier

    """

    # validate inputs
    utils.match_pd_vars(id_table, {id_var}).assert_present()

    N_invalid_ids = sum(id_table[id_var].isna())
    if N_invalid_ids != 0:
        raise ValueError(
            f'{N_invalid_ids} entries in "id_table" were missing; '
            "entries with no identifiers should still include an Identifiers object"
        )

    # Get the identifiers as a list of dicts
    df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
    # Filter out zero-length lists
    df = df.query(f"{id_var} != 0")
    # Unnest the list of dicts into one dict per row
    df = df.explode(id_var)
    # Unnest the dict into a dataframe
    df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
    # Add the entry number as an index
    df["entry"] = df.groupby(df.index).cumcount()
    df.set_index("entry", append=True, inplace=True)
    return df

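To make the expected input concrete, here is a small usage sketch. The real Identifiers class lives in napistu.identifiers; the stand-in class, column name, and identifier values below are hypothetical and only meant to show the reshaping.

# --- usage sketch (illustrative, not part of the package source) ---
# unnest_identifiers only relies on each cell exposing an `.ids` list of dicts.
class _StubIdentifiers:
    def __init__(self, ids):
        self.ids = ids  # list of dicts, e.g. {"ontology": ..., "identifier": ...}

example_table = pd.DataFrame(
    {"s_Identifiers": [_StubIdentifiers([{"ontology": "uniprot", "identifier": "P00001"}])]},
    index=pd.Index(["S00000000"], name="s_id"),
)
# returns one row per identifier, indexed by ("s_id", "entry")
unnest_identifiers(example_table, "s_Identifiers")
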
def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
    """Format integer values as internal IDs, e.g. 3 with id_type "s_id" becomes "S00000003"."""
    id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
    return [id_prefix + format(x, f"0{id_len}d") for x in id_values]


def id_formatter_inv(ids: list[str]) -> list[int]:
    """
    ID Formatter Inverter

    Convert from internal IDs back to integer IDs
    """

    id_val = list()
    for an_id in ids:
        if re.match("^[A-Z]+[0-9]+$", an_id):
            id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
        else:
            id_val.append(np.nan)  # type: ignore

    return id_val

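A quick round-trip sketch of these two helpers (the values are hypothetical):

# --- usage sketch (illustrative) ---
ids = id_formatter([0, 1, 42], "s_id")   # ['S00000000', 'S00000001', 'S00000042']
id_formatter_inv(ids)                    # [0, 1, 42]
id_formatter_inv(["not_an_id"])          # [nan]; non-conforming keys map to np.nan
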
def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
    """
    Get Current Max ID

    Look at a table from an SBML_dfs object and find the largest primary key following
    the default naming convention for the table.

    Params:
    sbml_dfs_table (pd.DataFrame):
        A table derived from an SBML_dfs object.

    Returns:
    current_max_id (int):
        The largest id which is already defined in the table using its expected naming
        convention. If no IDs following this convention are present then the default
        will be -1. In this way new IDs will be added starting with 0.

    """

    existing_ids_numeric = id_formatter_inv(sbml_dfs_table.index.tolist())

    # filter np.nan which will be introduced if the key is not the default format
    existing_ids_numeric_valid = [x for x in existing_ids_numeric if x is not np.nan]
    if len(existing_ids_numeric_valid) == 0:
        current_max_id = -1
    else:
        current_max_id = max(existing_ids_numeric_valid)

    return current_max_id

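Continuing the ID sketch above with a hypothetical table:

# --- usage sketch (illustrative) ---
species_like = pd.DataFrame(index=pd.Index(id_formatter(range(3), "s_id"), name="s_id"))
get_current_max_id(species_like)                      # 2
get_current_max_id(pd.DataFrame(index=["a", "b"]))    # -1, so new IDs start at 0
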
def adapt_pw_index(
    source: str | indices.PWIndex,
    species: str | Iterable[str] | None,
    outdir: str | None = None,
) -> indices.PWIndex:
    """Adapts a pw_index

    Helpful to filter for species before reconstructing.

    Args:
        source (str | PWIndex): uri for a pw_index.csv file or a PWIndex object
        species (str | Iterable[str] | None): species to retain in the index
        outdir (str | None, optional): Optional directory to write the pw_index to.
            Defaults to None.

    Returns:
        indices.PWIndex: Filtered pw index
    """
    if isinstance(source, str):
        pw_index = indices.PWIndex(source)
    elif isinstance(source, indices.PWIndex):
        pw_index = copy.deepcopy(source)
    else:
        raise ValueError("'source' needs to be str or PWIndex.")
    pw_index.filter(species=species)

    if outdir is not None:
        with open_fs(outdir, create=True) as fs:
            with fs.open("pw_index.tsv", "w") as f:
                pw_index.index.to_csv(f, sep="\t")
    return pw_index

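A minimal sketch of how this might be called; the path, species name, and output directory are hypothetical.

# --- usage sketch (illustrative; paths/species are hypothetical) ---
filtered_index = adapt_pw_index(
    "path/to/pw_index.tsv",        # or an existing indices.PWIndex
    species="Saccharomyces cerevisiae",
    outdir="path/to/filtered",     # a filtered pw_index.tsv is written here
)
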
def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
    assert isinstance(dogmatic, bool)
    if dogmatic:
        logger.info(
            "Running in dogmatic mode - differences between genes, transcripts, "
            "and proteins will be maintained as separate species where possible."
        )
        # preserve differences between genes, transcripts, and proteins
        defining_biological_qualifiers = BQB_DEFINING_ATTRS
    else:
        logger.info(
            "Running in non-dogmatic mode - genes, transcripts, and proteins will "
            "be merged if possible."
        )
        # merge genes, transcripts, and proteins (if they are defined with
        # bqb terms which specify their relationships).
        defining_biological_qualifiers = BQB_DEFINING_ATTRS_LOOSE

    return defining_biological_qualifiers

def match_entitydata_index_to_entity(
    entity_data_dict: dict,
    an_entity_data_type: str,
    consensus_entity_df: pd.DataFrame,
    entity_schema: dict,
    table: str,
) -> pd.DataFrame:
    """
    Match the index of entity_data_dict[an_entity_data_type] with the index of the corresponding entity.
    Update entity_data_dict[an_entity_data_type]'s index to match consensus_entity_df's index.
    Report cases where entity_data has indices not in the corresponding entity's index.
    Args
        entity_data_dict (dict): dictionary containing all models' "an_entity_data_type" dictionaries
        an_entity_data_type (str): data_type from species/reactions_data in entity_data_dict
        consensus_entity_df (pd.DataFrame): the dataframe of the corresponding entity
        entity_schema (dict): schema for "table"
        table (str): table whose data is being consolidated (currently species or reactions)
    Returns:
        entity_data_df (pd.DataFrame): table for entity_data_dict[an_entity_data_type]
    """

    data_table = table + "_data"
    entity_data_df = entity_data_dict[an_entity_data_type]

    # ensure entity_data_df[an_entity_data_type]'s index doesn't have
    # ids that are not in consensus_entity's index
    if len(entity_data_df.index.difference(consensus_entity_df.index)) == 0:
        logger.info(f"{data_table} ids are included in {table} ids")
    else:
        logger.warning(
            f"{data_table} has ids that are not matched to {table} ids; "
            "please check the mismatched ids first"
        )

    # when entity_data_df covers only a subset of the index of consensus_entity_df,
    # add ids only in consensus_entity_df to entity_data_df and fill values with NaN
    if len(entity_data_df) != len(consensus_entity_df):
        logger.info(
            f"The {data_table} has {len(entity_data_df)} ids, "
            f"different from the {len(consensus_entity_df)} ids in the {table} table; "
            f"updating {data_table} ids."
        )

        entity_data_df = pd.concat(
            [
                entity_data_df,
                consensus_entity_df[
                    ~consensus_entity_df.index.isin(entity_data_df.index)
                ],
            ],
            ignore_index=False,
        )

        entity_data_df.drop(entity_schema["vars"], axis=1, inplace=True)

    return entity_data_df

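The alignment step itself is plain pandas; a stripped-down sketch with hypothetical ids and columns:

# --- illustration of the index-alignment step (hypothetical ids/columns) ---
entity = pd.DataFrame({"r_name": ["rxn A", "rxn B"]},
                      index=pd.Index(["R00000000", "R00000001"], name="r_id"))
data = pd.DataFrame({"score": [0.7]}, index=pd.Index(["R00000000"], name="r_id"))
# add entity rows missing from the data table, then drop the entity's own columns
aligned = pd.concat([data, entity[~entity.index.isin(data.index)]], ignore_index=False)
aligned = aligned.drop(["r_name"], axis=1)   # plays the role of entity_schema["vars"]
# aligned now has one (NaN-filled where needed) row per entity id
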
def check_entity_data_index_matching(sbml_dfs, table):
    """
    Update the input sbml_dfs's entity_data (dict) index
    with match_entitydata_index_to_entity,
    so that the index for the dataframe(s) in entity_data (dict) matches the sbml_dfs'
    corresponding entity, and then passes sbml_dfs.validate()
    Args
        sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
        table (str): table whose data is being consolidated (currently species or reactions)
    Returns
        sbml_dfs (cpr.SBML_dfs):
            sbml_dfs whose entity_data is checked to have the same index
            as the corresponding entity.
    """

    table_data = table + "_data"

    entity_data_dict = getattr(sbml_dfs, table_data)
    entity_schema = sbml_dfs.schema[table]
    sbml_dfs_entity = getattr(sbml_dfs, table)

    if entity_data_dict != {}:
        entity_data_types = set.union(set(entity_data_dict.keys()))

        entity_data_dict_checked = {
            x: match_entitydata_index_to_entity(
                entity_data_dict, x, sbml_dfs_entity, entity_schema, table
            )
            for x in entity_data_types
        }

        if table == SBML_DFS.REACTIONS:
            sbml_dfs.reactions_data = entity_data_dict_checked
        elif table == SBML_DFS.SPECIES:
            sbml_dfs.species_data = entity_data_dict_checked

    return sbml_dfs

def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
    assert isinstance(dogmatic, bool)
    if dogmatic:
        logger.info(
            "Running in dogmatic mode - differences between genes, transcripts, "
            "and proteins will be maintained as separate species where possible."
        )
        # preserve differences between genes, transcripts, and proteins
        defining_biological_qualifiers = BQB_DEFINING_ATTRS
    else:
        logger.info(
            "Running in non-dogmatic mode - genes, transcripts, and proteins will "
            "be merged if possible."
        )
        # merge genes, transcripts, and proteins (if they are defined with
        # bqb terms which specify their relationships).
        defining_biological_qualifiers = BQB_DEFINING_ATTRS_LOOSE

    return defining_biological_qualifiers

def _stub_ids(ids):
    """Stub with a blank ID if an ids list is blank; otherwise create a DataFrame from the provided ids"""
    if len(ids) == 0:
        return pd.DataFrame(
            {
                IDENTIFIERS.ONTOLOGY: [None],
                IDENTIFIERS.IDENTIFIER: [None],
                IDENTIFIERS.URL: [None],
                IDENTIFIERS.BQB: [None],
            }
        )
    else:
        return pd.DataFrame(ids)
napistu/source.py
ADDED
@@ -0,0 +1,394 @@
from __future__ import annotations

import pandas as pd
from napistu import indices
from napistu.constants import SOURCE_SPEC


class Source:
    """
    An Entity's Source

    Attributes
    ----------
    source : pd.DataFrame
        A dataframe containing the model source and other optional variables

    Methods
    -------

    """

    def __init__(
        self,
        source_df: pd.DataFrame | None = None,
        init: bool = False,
        pw_index: indices.PWIndex | None = None,
    ) -> None:
        """
        Tracks the model(s) an entity (i.e., a compartment, species, reaction) came from.

        By convention sources exist only for the models that an entity came from rather
        than the current model they are part of. For example, when combining Reactome models
        into a consensus, a molecule which existed in multiple models would have a source entry
        for each, but it would not have a source entry for the consensus model itself.

        Parameters
        ----------
        source_df : pd.DataFrame
            A dataframe containing the model source and other optional variables
        init : bool
            Creates an empty source object. This is typically used when creating an SBML_dfs
            object from a single source.
        pw_index : indices.PWIndex
            A pathway index whose metadata will be joined onto source_df.

        Returns
        -------
        None.

        """

        if init is True:
            # initialize with an empty Source
            self.source = None
        else:
            if isinstance(source_df, pd.DataFrame):
                # if pw_index is provided then it will be joined to source_df to add additional metadata
                if pw_index is not None:
                    if not isinstance(pw_index, indices.PWIndex):
                        raise ValueError(
                            f"pw_index must be an indices.PWIndex or None and was {type(pw_index).__name__}"
                        )
                    else:
                        # check that all models are present in the pathway index
                        missing_pathways = set(
                            source_df[SOURCE_SPEC.MODEL].tolist()
                        ).difference(
                            set(pw_index.index[SOURCE_SPEC.PATHWAY_ID].tolist())
                        )
                        if len(missing_pathways) > 0:
                            raise ValueError(
                                f"{len(missing_pathways)} pathway models are present"
                                f" in source_df but not the pw_index: {', '.join(missing_pathways)}"
                            )

                        source_df = source_df.merge(
                            pw_index.index,
                            left_on=SOURCE_SPEC.MODEL,
                            right_on=SOURCE_SPEC.PATHWAY_ID,
                        )

                self.source = source_df
            else:
                raise TypeError(
                    'source_df must be a pd.DataFrame if "init" is False, but was type '
                    f"{type(source_df).__name__}"
                )

            if SOURCE_SPEC.MODEL not in source_df.columns.values.tolist():
                raise ValueError(
                    f"{SOURCE_SPEC.MODEL} variable was not found, but is required in a Source object"
                )
            if SOURCE_SPEC.PATHWAY_ID not in source_df.columns.values.tolist():
                raise ValueError(
                    f"{SOURCE_SPEC.PATHWAY_ID} variable was not found, but is required in a Source object"
                )

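A minimal construction sketch; the Reactome-style identifiers are hypothetical and the required column names are taken from SOURCE_SPEC rather than hard-coded.

# --- usage sketch (illustrative; identifiers are hypothetical) ---
single_model = pd.DataFrame(
    {SOURCE_SPEC.MODEL: ["R-HSA-0000001"], SOURCE_SPEC.PATHWAY_ID: ["R-HSA-0000001"]}
)
src = Source(single_model)     # validates that the required columns are present
empty = Source(init=True)      # empty placeholder; .source is None
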
def create_source_table(
    lookup_table: pd.Series, table_schema: dict, pw_index: indices.PWIndex | None
) -> pd.DataFrame:
    """
    Create Source Table

    Create a table with one row per "new_id" and a Source object created from the union
    of "old_id" Source objects
    """

    if SOURCE_SPEC.SOURCE not in table_schema.keys():
        raise ValueError(
            f"{SOURCE_SPEC.SOURCE} not present in schema, can't create source_table"
        )

    # take lookup_table and create an index on "new_id". Multiple rows may have the
    # same value for new_id so these are grouped together.
    lookup_table_rearranged = lookup_table.reset_index().set_index(["new_id"])

    # apply a function to each value of new_id to create a Source
    # object based on the dataframe specific to new_id
    # pw_index is provided to fill out additional meta-information beyond the
    # pathway_id which defines a single source
    def create_source(group):
        return Source(
            group.reset_index(drop=True),
            pw_index=pw_index,
        )

    id_table = (
        lookup_table_rearranged.groupby("new_id")
        .apply(create_source)
        .rename(table_schema[SOURCE_SPEC.SOURCE])
        .to_frame()
    )

    id_table.index = id_table.index.rename(table_schema["pk"])

    return id_table

def merge_sources(source_list: list | pd.Series) -> Source:
    """
    Merge Sources

    Merge a list of Source objects into a single Source object

    """

    # filter to non-empty sources
    # empty sources have only been initialized; a merge hasn't occurred
    existing_sources = [s.source is not None for s in source_list]
    if not any(existing_sources):
        if isinstance(source_list, list):
            return source_list[0]
        else:
            return source_list.iloc[0]

    existing_source_list = [
        x.source for x, y in zip(source_list, existing_sources) if y
    ]

    return Source(pd.concat(existing_source_list))

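Continuing the hypothetical Source sketch above, two per-model Sources can be combined:

# --- usage sketch (illustrative) ---
other = Source(
    pd.DataFrame({SOURCE_SPEC.MODEL: ["R-HSA-0000002"], SOURCE_SPEC.PATHWAY_ID: ["R-HSA-0000002"]})
)
merged = merge_sources([src, other])   # merged.source stacks both models' rows
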
def unnest_sources(
    source_table: pd.DataFrame, source_var: str, verbose: bool = False
) -> pd.DataFrame:
    """
    Unnest Sources

    Take a pd.DataFrame containing an array of Sources and
    return one row per source.

    Parameters:
    source_table: pd.DataFrame
        a table containing an array of Sources
    source_var: str
        variable containing Sources

    Returns:
        pd.DataFrame containing the index of source_table but expanded
        to include one row per source

    """

    sources = list()
    source_table_index = source_table.index.to_frame().reset_index(drop=True)

    for i in range(source_table.shape[0]):
        if verbose:
            print(f"Processing {source_table_index.index.values[i]}")

        # check that the entries of source_var are Source objects
        source_value = source_table[source_var].iloc[i]

        assert isinstance(source_value, Source)

        if source_value.source is None:
            print("Some sources were missing - returning None")
            return None

        source_tbl = pd.DataFrame(source_value.source)
        source_tbl.index.name = SOURCE_SPEC.INDEX_NAME
        source_tbl = source_tbl.reset_index()

        # add the original index as variables and then set the index
        for j in range(source_table_index.shape[1]):
            source_tbl[source_table_index.columns[j]] = source_table_index.iloc[i, j]
        source_tbl = source_tbl.set_index(
            list(source_table_index.columns) + [SOURCE_SPEC.INDEX_NAME]
        )

        sources.append(source_tbl)

    return pd.concat(sources)

def greedy_set_coverge_of_sources(
    source_df: pd.DataFrame, table_schema: dict
) -> pd.DataFrame:
    """
    Greedy Set Coverage of Sources

    Apply the greedy set coverage algorithm to find a small set of
    sources which covers all entries

    Parameters:
    source_df: pd.DataFrame
        pd.DataFrame containing the index of source_table but expanded to
        include one row per source. As produced by source.unnest_sources()

    Returns:
    minimal_sources: pd.DataFrame
        The subset of source_df corresponding to the covering set of pathways

    """

    # rollup pathways with identical membership
    deduplicated_sources = _deduplicate_source_df(source_df, table_schema)

    unaccounted_for_members = deduplicated_sources
    retained_pathway_ids = []

    while unaccounted_for_members.shape[0] != 0:
        # find the pathway with the most members
        pathway_members = unaccounted_for_members.groupby(SOURCE_SPEC.PATHWAY_ID).size()
        top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
        retained_pathway_ids.append(top_pathway)

        # remove all members associated with the top pathway
        members_captured = (
            unaccounted_for_members[
                unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
            ]
            .index.get_level_values(table_schema["pk"])
            .tolist()
        )

        unaccounted_for_members = unaccounted_for_members[
            ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
                members_captured
            )
        ]

    minimal_sources = deduplicated_sources[
        deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
    ].sort_index()

    return minimal_sources

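A hedged end-to-end sketch; species_table, "s_source", and table_schema below are placeholders for whatever entity table and schema are in use.

# --- workflow sketch (illustrative; names are placeholders) ---
expanded = unnest_sources(species_table, source_var="s_source")   # one row per source
minimal = greedy_set_coverge_of_sources(expanded, table_schema)   # greedy covering set
minimal[SOURCE_SPEC.PATHWAY_ID].unique()                          # pathways retained
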
def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
    """Combine entries in a source table when multiple models have the same members."""

    # drop entries which are missing required attributes and throw an error if none are left
    REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
    indexed_sources = (
        source_df.reset_index()
        .merge(source_df[REQUIRED_NON_NA_ATTRIBUTES].dropna())
        .set_index(SOURCE_SPEC.PATHWAY_ID)
    )

    if indexed_sources.shape[0] == 0:
        raise ValueError(
            f"source_df was provided but zero entries had a defined {' OR '.join(REQUIRED_NON_NA_ATTRIBUTES)}"
        )

    pathways = indexed_sources.index.unique()

    # identify pathways with identical coverage

    pathway_member_string = (
        pd.DataFrame(
            [
                {
                    SOURCE_SPEC.PATHWAY_ID: p,
                    "membership_string": "_".join(
                        set(indexed_sources.loc[[p]][table_schema["pk"]].tolist())
                    ),
                }
                for p in pathways
            ]
        )
        .drop_duplicates()
        .set_index("membership_string")
    )

    membership_categories = pathway_member_string.merge(
        source_df.groupby(SOURCE_SPEC.PATHWAY_ID).first(),
        left_on=SOURCE_SPEC.PATHWAY_ID,
        right_index=True,
    )

    category_index = membership_categories.index.unique()
    assert isinstance(category_index, pd.core.indexes.base.Index)

    merged_sources = pd.concat(
        [
            _collapse_by_membership_string(s, membership_categories, table_schema)  # type: ignore
            for s in category_index.tolist()
        ]
    )
    merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
        table_schema["pk"]
    ).cumcount()

    return merged_sources.set_index(
        [table_schema["pk"], SOURCE_SPEC.INDEX_NAME]
    ).sort_index()

def _collapse_by_membership_string(
    membership_string: str, membership_categories: pd.DataFrame, table_schema: dict
) -> pd.DataFrame:
    """Assign each member of a membership-string to a set of pathways."""

    collapsed_source_membership = _collapse_source_df(
        membership_categories.loc[membership_string]
    )

    return pd.DataFrame(
        [
            pd.concat(
                [pd.Series({table_schema["pk"]: ms}), collapsed_source_membership]
            )
            for ms in membership_string.split("_")
        ]
    )

def _collapse_source_df(source_df: pd.DataFrame | pd.Series) -> pd.Series:
    """Collapse a source_df table into a single entry."""

    if isinstance(source_df, pd.DataFrame):
        collapsed_source_series = pd.Series(
            {
                SOURCE_SPEC.PATHWAY_ID: " OR ".join(source_df[SOURCE_SPEC.PATHWAY_ID]),
                SOURCE_SPEC.MODEL: " OR ".join(source_df[SOURCE_SPEC.MODEL]),
                SOURCE_SPEC.SOURCE: " OR ".join(
                    set(source_df[SOURCE_SPEC.SOURCE].tolist())
                ),
                SOURCE_SPEC.SPECIES: " OR ".join(
                    set(source_df[SOURCE_SPEC.SPECIES].tolist())
                ),
                SOURCE_SPEC.NAME: " OR ".join(source_df[SOURCE_SPEC.NAME]),
                SOURCE_SPEC.N_COLLAPSED_PATHWAYS: source_df.shape[0],
            }
        )
    elif isinstance(source_df, pd.Series):
        collapsed_source_series = pd.Series(
            {
                SOURCE_SPEC.PATHWAY_ID: source_df[SOURCE_SPEC.PATHWAY_ID],
                SOURCE_SPEC.MODEL: source_df[SOURCE_SPEC.MODEL],
                SOURCE_SPEC.SOURCE: source_df[SOURCE_SPEC.SOURCE],
                SOURCE_SPEC.SPECIES: source_df[SOURCE_SPEC.SPECIES],
                SOURCE_SPEC.NAME: source_df[SOURCE_SPEC.NAME],
                SOURCE_SPEC.N_COLLAPSED_PATHWAYS: 1,
            }
        )
    else:
        raise TypeError(
            f"source_df must be a pd.DataFrame or pd.Series, but was a {type(source_df).__name__}"
        )

    return collapsed_source_series

def _safe_source_merge(member_Sources: Source | pd.Series) -> Source:
    """Combine either a Source or a pd.Series of Sources into a single Source object."""

    if isinstance(member_Sources, Source):
        return member_Sources
    elif isinstance(member_Sources, pd.Series):
        return merge_sources(member_Sources.tolist())
    else:
        raise TypeError("Expecting source.Source or pd.Series")