fpg-observational-model 1.0.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fpg_observational_model/__init__.py +48 -0
- fpg_observational_model/plotting_code/__init__.py +0 -0
- fpg_observational_model/plotting_code/observational_extracted.py +261 -0
- fpg_observational_model/run_observational_model.py +529 -0
- fpg_observational_model/unified_metric_calculations.py +1019 -0
- fpg_observational_model/unified_sampling.py +751 -0
- fpg_observational_model/vm_parallelization.py +296 -0
- fpg_observational_model-1.0.1.dev1.dist-info/METADATA +249 -0
- fpg_observational_model-1.0.1.dev1.dist-info/RECORD +12 -0
- fpg_observational_model-1.0.1.dev1.dist-info/WHEEL +5 -0
- fpg_observational_model-1.0.1.dev1.dist-info/licenses/LICENSE.txt +21 -0
- fpg_observational_model-1.0.1.dev1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""FPG Observational Model - Genomic surveillance sampling for EMOD simulations.
|
|
2
|
+
|
|
3
|
+
This package provides tools for converting EMOD's Full Parasite Genetics (FPG)
|
|
4
|
+
simulation output into recapitulative sampling for genomic surveillance analysis.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Re-export the main entry points of each submodule so that they can be
# imported directly from the package.
from .run_observational_model import (
    run_observational_model,
    get_default_config,
    update_matrix_indices,
    extract_sampled_infections,
)

from .unified_sampling import (
    run_sampling_model,
    subset_randomly,
    subset_by_seasons,
    subset_by_age,
)

from .unified_metric_calculations import (
    register_matrix,
    get_matrix,
    run_time_summaries,
    comprehensive_group_summary,
)

# Define public API: the names exported by a star-import of this package.
# Every entry below is one of the names imported above.
__all__ = [
    # Main workflow
    "run_observational_model",
    "get_default_config",
    "update_matrix_indices",
    "extract_sampled_infections",

    # Sampling functions
    "run_sampling_model",
    "subset_randomly",
    "subset_by_seasons",
    "subset_by_age",

    # Metric calculations
    "register_matrix",
    "get_matrix",
    "run_time_summaries",
    "comprehensive_group_summary",
]
|
|
File without changes
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
import ast
import math
import os
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
|
|
6
|
+
|
|
7
|
+
def parse_list(s):
    """
    Convert a string representation of a list into an actual Python list.

    The text is evaluated with ``ast.literal_eval``, so only Python literals
    are accepted and no code is executed.

    Parameters:
        s: Value to parse, normally a string such as ``"[1, 2, 3]"``.

    Returns:
        list: The parsed items. A parsed tuple or set is converted to a list.
        An empty list is returned when ``s`` cannot be parsed (malformed
        text, ``None``, NaN, ...) or when it parses to something that is not
        a collection (e.g. ``"5"``), so callers can always apply ``len`` and
        ``set`` to the result.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input.
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)
    # Scalars, dicts, etc. are not usable as an id list.
    return []
|
|
16
|
+
|
|
17
|
+
def assign_season_group(row):
    """
    Label a record with the wet/dry season window its month falls in.

    Parameters:
        row: Mapping-like record (e.g. a DataFrame row) with 'year' and
            'month' entries that can be converted to int.

    Returns:
        str: For months 8 and above, the wet season starting in August of
        the same year; for month 1, the wet season that started in August of
        the previous year; for every other month, the dry season
        (February-July) of the same year.
    """
    year = int(row['year'])
    month = int(row['month'])

    # August onwards opens a wet season that runs into next January.
    if month >= 8:
        return f"Wet season: {year}-08 to {year+1}-01"
    # January closes the wet season that began the previous August.
    if month == 1:
        return f"Wet season: {year-1}-08 to {year}-01"
    # Everything else is labelled with the February-July dry season.
    return f"Dry season: {year}-02 to {year}-07"
|
|
29
|
+
|
|
30
|
+
def assign_peak_group(row):
    """
    Label a record with the peak-season window its month falls in.

    Parameters:
        row: Mapping-like record (e.g. a DataFrame row) with 'year' and
            'month' entries that can be converted to int.

    Returns:
        str: A year-specific wet-season label for months 10-12, a
        year-specific dry-season label for months 3-6, and the fixed label
        "Off-peak season" (no year) for every other month.
    """
    year = int(row['year'])
    month = int(row['month'])

    if 10 <= month <= 12:
        return f"Wet season: {year}-10 to {year}-12"
    if 3 <= month <= 6:
        return f"Dry season: {year}-03 to {year}-06"
    # Months outside both peaks share a single label across all years.
    return "Off-peak season"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def reassign_by_intervention(df, intervention_reset=17):
    """
    Re-express time relative to a reset month.

    The existing 'year' column is kept under the name 'sim_year', and two
    columns are derived from 'continuous_month':
        - intervention_month: continuous_month minus ``intervention_reset``.
        - year: intervention_month floor-divided by 12, so months before the
          reset fall in negative years.

    Parameters:
        df (pd.DataFrame): Input with a 'continuous_month' column (and
            normally a 'year' column to preserve).
        intervention_reset (int): Value of 'continuous_month' that becomes
            month 0. Defaults to 17.

    Returns:
        pd.DataFrame: The renamed frame with the added columns; the columns
        of the frame passed in are left as they were.
    """
    shifted = df.rename(columns={'year': 'sim_year'})
    months_since_reset = shifted['continuous_month'] - intervention_reset

    shifted['intervention_month'] = months_since_reset
    shifted['year'] = months_since_reset // 12
    return shifted
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Fixed random seed used by the sampling steps in this module.
input_seed=418
|
|
50
|
+
def process_genetic_data(df):
    """
    Reduce the infection table to one record per person-year and derive
    per-infection genetic columns.

    Steps:
        1. Randomly keep one row per ('IndividualID', 'year') pair, seeded
           with the module-level ``input_seed``.
        2. Add 1 to 'month' (presumably converting a 0-based month to a
           calendar month -- confirm against the input format).
        3. Parse 'genome_ids' into lists and compute:
            - true_coi: the total count of items in genome_ids.
            - effective_coi: the count of unique items in genome_ids.
        4. Parse 'bite_ids' into lists and compute:
            - superinfection: True when the infection has more than one
              distinct bite id.
            - cotransmission: True when the infection has exactly one
              distinct bite id.
        5. Add 'season' and 'peak_season' labels with assign_season_group
           and assign_peak_group.

    Parameters:
        df (pd.DataFrame): Input containing at least 'IndividualID', 'year',
            'month', and the string columns 'genome_ids' and 'bite_ids'.

    Returns:
        pd.DataFrame: The subsampled rows with the additional columns.
    """
    # Subsample one representative infection per person per year.
    sampled = df.groupby(['IndividualID', 'year']).sample(n=1, random_state=input_seed)
    sampled['month'] = sampled['month'] + 1

    # Genome-level complexity of infection.
    genomes = sampled["genome_ids"].apply(parse_list)
    sampled["genome_ids"] = genomes
    sampled["true_coi"] = genomes.apply(len)
    sampled["effective_coi"] = genomes.apply(lambda ids: len(set(ids)))

    # Bite-level origin: several distinct bites vs. a single bite.
    bites = sampled["bite_ids"].apply(parse_list)
    distinct_bites = bites.apply(lambda ids: len(set(ids)))
    sampled["bite_ids"] = bites
    sampled["superinfection"] = distinct_bites > 1
    sampled["cotransmission"] = distinct_bites == 1

    # Season labels are computed row by row from 'year' and 'month'.
    sampled["season"] = sampled.apply(assign_season_group, axis=1)
    sampled["peak_season"] = sampled.apply(assign_peak_group, axis=1)

    return sampled
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _sample_up_to(df, by, n, seed):
    """Return up to ``n`` randomly chosen rows from each group of ``df`` keyed by ``by``."""
    # Groups are sampled one at a time instead of through groupby.apply,
    # whose handling of the grouping columns differs between pandas
    # releases; the later summary step still needs those columns.
    pieces = [
        grp.sample(n=min(len(grp), n), random_state=seed)
        for _, grp in df.groupby(by)
    ]
    if not pieces:
        # Nothing to sample from: return an empty frame with the same columns.
        return df.iloc[0:0]
    return pd.concat(pieces)


def summarize_infections(df, groupby_cols=['year', 'month'],
                         sample_n=None, sample_proportionally=True, sample_seasons=False,
                         seed=input_seed):
    """
    Group and summarize infection data by a given time frame.

    For each group (e.g., by year and month), computes:
      1. Total rows, the count and proportion of rows with effective_coi > 1
         (polygenomic rows), and the mean effective_coi.
      2. Total and unique counts (and the unique/total proportion) of
         genome_ids, after flattening the lists from all rows.
      3. Among polygenomic rows, the count and proportion where
         cotransmission is True and where superinfection is True.

    A proportion whose denominator is zero is reported as None.

    Optionally, a random sample of rows is taken before grouping. When
    ``sample_n`` is given, the sampling strata are chosen as follows:
      - sample_proportionally=True, sample_seasons=False: up to ``sample_n``
        rows per 'year'.
      - sample_proportionally=True, sample_seasons=True: up to
        ``floor(sample_n / 2)`` rows per 'season'.
      - sample_proportionally=False: up to ``floor(sample_n / 12)`` rows per
        ('year', 'month').
    A stratum with fewer rows than its quota contributes all of its rows.

    Parameters:
        df (pd.DataFrame): Input DataFrame. Must contain columns:
            - 'effective_coi' (numeric, computed beforehand)
            - 'genome_ids' (list-like; if not, parse first with parse_list)
            - 'cotransmission' and 'superinfection' (Boolean)
            - the columns named in ``groupby_cols`` and, when sampling, the
              columns of the sampling strata described above
        groupby_cols (list): List of column names to group by. Defaults to ['year', 'month'].
        sample_n (int or None): If provided, subsample the rows before summarizing.
        sample_proportionally (bool): Selects the sampling strata (see above).
        sample_seasons (bool): Selects the sampling strata (see above).
        seed (int, optional): Random seed used for every sampling scheme.

    Returns:
        pd.DataFrame: One row per group with the grouping columns followed by
        n_infections, poly_coi_count, poly_coi_prop, coi_mean,
        genome_ids_total_count, genome_ids_unique_count,
        genome_ids_unique_prop, cotransmission_count, cotransmission_prop,
        superinfection_count and superinfection_prop.
    """
    # Optionally, subsample the DataFrame.
    if sample_n is not None:
        if not sample_proportionally:
            strata, quota = ['year', 'month'], math.floor(sample_n / 12)
        elif sample_seasons:
            strata, quota = ['season'], math.floor(sample_n / 2)
        else:
            strata, quota = ['year'], sample_n
        # The caller's seed is honoured in every branch.
        df = _sample_up_to(df, strata, quota, seed)

    def group_summary(group):
        n = len(group)

        # Polygenomic rows: effective_coi > 1.
        is_poly = group['effective_coi'] > 1
        poly_count = is_poly.sum()
        poly_prop = poly_count / n if n > 0 else None

        effective_coi_mean = group['effective_coi'].mean()

        # Flatten the genome_ids lists from all rows in the group; entries
        # that are not lists are skipped.
        all_genome_ids = [gid for sublist in group['genome_ids']
                          if isinstance(sublist, list)
                          for gid in sublist]
        total_genome = len(all_genome_ids)
        unique_genome = len(set(all_genome_ids))
        unique_prop = unique_genome / total_genome if total_genome > 0 else None

        # Cotransmission / superinfection are counted among polygenomic rows
        # only, and their proportions use the polygenomic count as denominator.
        polygenomic = group[is_poly]
        cotrans_count = polygenomic['cotransmission'].sum()
        cotrans_prop = cotrans_count / poly_count if poly_count > 0 else None

        supertrans_count = polygenomic['superinfection'].sum()
        supertrans_prop = supertrans_count / poly_count if poly_count > 0 else None

        return pd.Series({
            'n_infections': n,
            'poly_coi_count': poly_count,
            'poly_coi_prop': poly_prop,
            'coi_mean': effective_coi_mean,
            'genome_ids_total_count': total_genome,
            'genome_ids_unique_count': unique_genome,
            'genome_ids_unique_prop': unique_prop,
            'cotransmission_count': cotrans_count,
            'cotransmission_prop': cotrans_prop,
            'superinfection_count': supertrans_count,
            'superinfection_prop': supertrans_prop
        })

    summary_df = df.groupby(groupby_cols).apply(group_summary).reset_index()
    return summary_df
|
|
167
|
+
|
|
168
|
+
def combined_summaries(df):
    """
    Summarize the infection data under several grouping/sampling schemes and
    stack the results.

    Each scheme is one call to summarize_infections; its rows are tagged in
    a 'sampling_scheme' column:
        - 'All - Yearly': all rows, grouped by year.
        - 'All - Monthly': all rows, grouped by year, month and continuous_month.
        - 'Sample - Proportional': sample_n=100, grouped by year.
        - 'Sample - Even': sample_n=100 with sample_proportionally=False,
          grouped by year.
        - 'Sample - Seasonal': sample_n=100, grouped by season.
        - 'Sample - Peak Seasonal': sample_n=100, grouped by peak_season.

    Parameters:
        df (pd.DataFrame): Input DataFrame. Must contain the columns
            summarize_infections requires plus 'year', 'month',
            'continuous_month', 'season' and 'peak_season'.

    Returns:
        pd.DataFrame: The per-scheme summaries concatenated row-wise, in the
        order listed above.
    """
    input_n_sample = 100

    # (label, keyword arguments for summarize_infections), in output order.
    # NOTE(review): the two seasonal schemes do not pass sample_seasons=True,
    # so they use the default sampling strata -- confirm this is intended.
    schemes = [
        ('All - Yearly',
         dict(groupby_cols=['year'])),
        ('All - Monthly',
         dict(groupby_cols=['year', 'month', 'continuous_month'])),
        ('Sample - Proportional',
         dict(groupby_cols=['year'], sample_n=input_n_sample)),
        ('Sample - Even',
         dict(groupby_cols=['year'], sample_n=input_n_sample, sample_proportionally=False)),
        ('Sample - Seasonal',
         dict(groupby_cols=['season'], sample_n=input_n_sample)),
        ('Sample - Peak Seasonal',
         dict(groupby_cols=['peak_season'], sample_n=input_n_sample)),
    ]

    summaries = [
        summarize_infections(df, **options).assign(sampling_scheme=label)
        for label, options in schemes
    ]
    return pd.concat(summaries, axis=0)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def process_file(row, output_summary_dir, reassign_intervention_time=True):
    """Summarize one simulation's infection report and write the result to CSV.

    Reads "infIndexRecursive-genomes-df.csv" from the row's input directory,
    adds a 'continuous_month' column (year * 12 + month + 1), optionally
    re-expresses time with reassign_by_intervention, then runs
    process_genetic_data and combined_summaries and saves the summary.

    Parameters:
        row (pd.Series): A row from the file list DataFrame; must provide
            'input_dir' and 'output_name'.
        output_summary_dir (str): Folder where output files will be saved.
        reassign_intervention_time (bool): When True (default), apply
            reassign_by_intervention before processing.

    Returns:
        str: The path to the written summary file,
        "<output_summary_dir>/<output_name>_summary.csv".
    """
    output_name = row['output_name']
    source_csv = os.path.join(row['input_dir'], "infIndexRecursive-genomes-df.csv")

    # Load the raw report and add a running month index.
    infections = pd.read_csv(source_csv)
    infections['continuous_month'] = (infections["year"]) * 12 + infections["month"] + 1

    if reassign_intervention_time:
        infections = reassign_by_intervention(infections)

    run_summary = combined_summaries(process_genetic_data(infections))

    # Write the summary next to the other runs' outputs.
    output_file = os.path.join(output_summary_dir, f"{output_name}_summary.csv")
    run_summary.to_csv(output_file, index=False)
    return output_file
|
|
226
|
+
|
|
227
|
+
def process_all_files(file_list_path, output_summary_dir, max_workers=None):
    """
    Summarize every simulation listed in a file-list CSV, in parallel.

    Each row of the CSV is handed to process_file in a worker process. The
    outcome of each row is printed as it completes; a failure in one row is
    reported and does not stop the remaining rows.

    Parameters:
        file_list_path (str): Path to the CSV listing the runs; each row must
            provide the 'output_name' and 'input_dir' columns used by
            process_file.
        output_summary_dir (str): Folder where the summaries are written;
            created if it does not exist.
        max_workers (int or None): Passed to ProcessPoolExecutor; None lets
            the executor pick its default.

    Returns:
        None
    """
    # Read the file list CSV.
    file_list_df = pd.read_csv(file_list_path)

    # Ensure the output directory exists.
    os.makedirs(output_summary_dir, exist_ok=True)

    # Use ProcessPoolExecutor to parallelize file processing.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit every listed row (iterrows yields each row as a Series),
        # remembering which output name each future belongs to.
        future_to_file = {
            executor.submit(process_file, row, output_summary_dir): row['output_name']
            for _, row in file_list_df.iterrows()
        }

        # Report results as they complete.
        for future in as_completed(future_to_file):
            output_name = future_to_file[future]
            try:
                result = future.result()
            except Exception as exc:
                # Boundary handler: report the failing run and keep going.
                print(f"{output_name} generated an exception: {exc}")
            else:
                print(f"Processed: {result}")
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# Default locations for the batch run.
project_dir = "/mnt/data/malaria/synthetic_genomes/jessica_projects/2504_GRSweep"
file_list_path = os.path.join(project_dir, "sim_mapping_new_itns_6yr.csv")
# A folder for saving output summary files.
output_summary_dir = os.path.join(project_dir, "infectionFPGReport_summaries_6yr")  # Change to your desired output directory.

if __name__ == "__main__":
    # Only run the batch when executed as a script. process_all_files starts
    # a process pool, and worker processes may re-import this module; without
    # this guard every import would create directories and launch the batch
    # again.
    os.makedirs(output_summary_dir, exist_ok=True)

    process_all_files(file_list_path, output_summary_dir, max_workers=None)
|
|
260
|
+
|
|
261
|
+
|