pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
- pelican_nlp-0.1.2.dist-info/RECORD +75 -0
- pelican_nlp-0.1.1.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,930 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from scipy.stats import pearsonr, shapiro
|
3
|
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
4
|
+
from sklearn.compose import ColumnTransformer
|
5
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
6
|
+
import statsmodels.api as sm
|
7
|
+
import pandas as pd
|
8
|
+
import pickle
|
9
|
+
from plot_fluency import plot_regression_with_levels, plot_regression_coefficients, compare_groups, compare_groups_single_plot_regression, plot_stepwise_regression_results
|
10
|
+
|
11
|
+
from plotting_utils import (
|
12
|
+
GROUP_DICT as group_dict,
|
13
|
+
NAMES as names,
|
14
|
+
COG_VAR as cog_var,
|
15
|
+
format_p_value,
|
16
|
+
set_size
|
17
|
+
)
|
18
|
+
|
19
|
+
from config import CONFIG
|
20
|
+
import os
|
21
|
+
|
22
|
+
# Group definitions come from the shared configuration in config.py.
diag_dict = CONFIG["stats"]["groups"]

# Make sure both output directories exist before anything gets written to them.
for _dir_key in ("results_dir", "figures_dir"):
    os.makedirs(CONFIG["stats"]["paths"][_dir_key], exist_ok=True)
|
28
|
+
|
29
|
+
def apply_normality_test(df):
    """
    Apply the Shapiro-Wilk normality test to every column of a DataFrame.

    Missing values are dropped per column before testing. Columns that are
    not numeric, or that keep fewer than three non-missing values (the
    minimum sample size ``scipy.stats.shapiro`` accepts), are still listed
    in the output but get ``None`` for both the statistic and the p-value.

    Parameters:
    - df: DataFrame whose columns should be tested.

    Returns:
    - DataFrame with one row per input column and the columns
      'Column', 'Statistic' and 'P-Value'.
    """
    min_observations = 3  # shapiro() raises ValueError on shorter samples
    results = {'Column': [], 'Statistic': [], 'P-Value': []}

    for column in df.columns:
        stat, p_value = None, None

        if pd.api.types.is_numeric_dtype(df[column]):
            cleaned_data = df[column].dropna()
            if len(cleaned_data) >= min_observations:
                stat, p_value = shapiro(cleaned_data)

        results['Column'].append(column)
        results['Statistic'].append(stat)
        results['P-Value'].append(p_value)

    return pd.DataFrame(results)
|
56
|
+
|
57
|
+
|
58
|
+
def calculate_vif(X):
    """
    Compute the variance inflation factor (VIF) for each column of ``X``.

    Parameters:
    - X: DataFrame of predictors. Every column name must be a key of the
      module-level ``names`` mapping, otherwise a KeyError is raised.

    Returns:
    - DataFrame with a 'Variable' column (display names taken from
      ``names``) and a 'VIF' column, one row per column of ``X``.
    """
    design_matrix = X.values
    vif_scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(X.shape[1])
    ]
    display_names = [names[column] for column in X.columns]
    return pd.DataFrame({"Variable": display_names, "VIF": vif_scores})
|
67
|
+
|
68
|
+
|
69
|
+
def hierarchical_regression_with_vif(data, metrics, scores, control):
    """
    Run a two-step (hierarchical) OLS regression with VIF diagnostics.

    For every score a control-only model is fitted; then, for every metric,
    a full model (control variables plus the metric) is fitted and compared
    with the control model through ``anova_lm``. Outcome and predictors are
    standardized with ``StandardScaler`` before fitting.

    Parameters:
    - data: DataFrame holding the score, control and metric columns.
    - metrics: Iterable of metrics; each entry is a column name or a list of
      column names added in the second step.
    - scores: Iterable of outcome column names.
    - control: Column name or list of column names used in the first step.

    Returns:
    - DataFrame with one row per (score, metric) combination containing
      R² / adjusted R² of both models and their differences, the degrees of
      freedom as "(df1, df2)" strings, the F and p value of the model
      comparison, the VIFs of the full model, and the slope coefficients of
      both models keyed by predictor name ('model1', 'model2').
    """
    # Accept a single column name as well as a list of names everywhere below.
    control_cols = [control] if isinstance(control, str) else list(control)

    results = []
    scaler = StandardScaler()

    for score in scores:
        # The control-only model depends on the score alone, so fit it once
        # instead of once per metric.
        y_scaled = scaler.fit_transform(data[[score]])
        # has_constant='add' guarantees an intercept column even when a scaled
        # predictor happens to be constant (the default would silently skip it).
        X1 = sm.add_constant(scaler.fit_transform(data[control_cols]), has_constant='add')
        model1 = sm.OLS(y_scaled, X1).fit()

        df1_control = len(control_cols)  # Number of predictors in the control model
        df2_control = len(data) - df1_control - 1  # Residual degrees of freedom

        for metric in metrics:
            metric_cols = [metric] if isinstance(metric, str) else list(metric)
            X2_list = control_cols + metric_cols

            X2 = sm.add_constant(scaler.fit_transform(data[X2_list]), has_constant='add')
            model2 = sm.OLS(y_scaled, X2).fit()

            # Calculate VIF for the full model
            vif = calculate_vif(pd.DataFrame(X2, columns=['const'] + X2_list))

            anova_results = sm.stats.anova_lm(model1, model2)
            # Row 0 is the restricted model; row 1 holds the comparison.
            f_value = anova_results['F'].iloc[1]
            p_value = anova_results['Pr(>F)'].iloc[1]

            df1_full = len(X2_list)  # Number of predictors in the full model
            df2_full = len(data) - df1_full - 1  # Residual degrees of freedom

            results.append({
                'psychiatric_score': score,
                'control': control,
                'metric': metric,
                'r2_adj_control': model1.rsquared_adj,
                'r2_adj_full': model2.rsquared_adj,
                'r2_adj_change': model2.rsquared_adj - model1.rsquared_adj,
                'r2_control': model1.rsquared,
                'df_control': f"({df1_control}, {df2_control})",
                'df_full': f"({df1_full}, {df2_full})",
                'r2_full': model2.rsquared,
                'r2_change': model2.rsquared - model1.rsquared,
                'f_value': f_value,
                'p_value': p_value,
                'vif': vif.set_index('Variable')['VIF'].to_dict(),  # Save VIF as a dictionary
                # params[0] is the intercept; skip it so that every predictor
                # name is paired with its own slope.
                'model1': dict(zip(control_cols, model1.params.flatten()[1:])),
                'model2': dict(zip(X2_list, model2.params.flatten()[1:])),
            })

    return pd.DataFrame(results)
|
124
|
+
|
125
|
+
def report_stepwise_regression(df):
    """
    Turn hierarchical-regression results into compact APA-style strings.

    Parameters:
    - df: DataFrame with the columns 'metric', 'r2_adj_control',
      'r2_adj_full', 'f_value', 'p_value', 'df_control' and 'df_full'.

    Returns:
    - Dictionary mapping each metric's display name (looked up in the
      module-level ``names``) to its formatted string. The values printed
      after "R²" are the adjusted R² of the control and full model. When
      several rows share a metric, the last row wins.
    """
    summaries = {}

    for record in df.to_dict("records"):
        label = names[record['metric']]
        adj_control = record['r2_adj_control']
        adj_full = record['r2_adj_full']
        f_stat = record['f_value']
        p_text = format_p_value(record['p_value'])
        dof_control = record['df_control']
        dof_full = record['df_full']

        summaries[label] = (
            f"R² control {dof_control} = {adj_control:.3f}, R² full {dof_full} = {adj_full:.3f}, F = {f_stat:.2f}, {p_text}."
        )

    return summaries
|
152
|
+
|
153
|
+
|
154
|
+
def _make_one_hot_encoder():
    """Return a dense, drop-first one-hot encoder on old and new scikit-learn."""
    try:
        return OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the ``sparse`` keyword.
        return OneHotEncoder(drop='first', sparse=False)


def _residualize_outcome(frame, outcome, demographics, preprocessor, outcome_scaler,
                         numeric_features, categorical_features, results):
    """Regress one standardized outcome on the demographics, record the fit in ``results`` and return the residuals."""
    # Scale the outcome data before regression
    y_scaled = outcome_scaler.fit_transform(frame[[outcome]]).flatten()

    # has_constant='add' keeps the intercept column (and therefore the
    # feature-name alignment below) even if a scaled demographic is constant.
    X = sm.add_constant(preprocessor.fit_transform(frame[demographics]), has_constant='add')

    # Column order of the design matrix: intercept, numeric, one-hot encoded.
    feature_names = ['const'] + numeric_features
    if categorical_features:
        # The 'cat' transformer only exists when categorical demographics do.
        encoder = preprocessor.named_transformers_['cat']
        feature_names += list(encoder.get_feature_names_out(categorical_features))

    model = sm.OLS(y_scaled, X).fit()
    conf_int = model.conf_int()  # computed once instead of once per cell

    # Store coefficients, confidence intervals, p-values (intercept excluded)
    for i, name in enumerate(feature_names):
        if name == 'const':
            continue
        results['coefficients'].loc[name, outcome] = model.params[i]
        results['ci_lower'].loc[name, outcome] = conf_int[i, 0]
        results['ci_upper'].loc[name, outcome] = conf_int[i, 1]
        results['p_values'].loc[name, outcome] = model.pvalues[i]

    # Store model metrics
    results['f_statistic'].loc[outcome] = model.fvalue
    results['r_squared'].loc[outcome] = model.rsquared
    results['adj_r_squared'].loc[outcome] = model.rsquared_adj
    results['aic'].loc[outcome] = model.aic
    results['bic'].loc[outcome] = model.bic
    results['model_p_value'].loc[outcome] = model.f_pvalue

    return model.resid


def regress_out_demographics(df, outcomes=None, demographics=None, names=None, group_filter=None):
    """
    Regress out demographic variables from each outcome separately.
    The original outcome columns are scaled and then replaced with residuals,
    while keeping the rest of the DataFrame intact.

    Parameters:
    - df: Original DataFrame containing the full dataset.
    - outcomes: List of outcome columns to scale and regress out demographics from.
      Defaults to the clinical plus cognitive outcomes from CONFIG.
    - demographics: List of demographic variables. Defaults to the demographics from CONFIG.
      Numeric columns are standardized, all others are one-hot encoded (first level dropped).
    - names: Optional dictionary mapping column names to display names; used to
      rename the index/columns of the returned result tables.
    - group_filter: Optional name of a grouping column. If given, the regression
      is run separately within each unique value of that column.

    Returns:
    - Tuple (df_residuals, results): a copy of ``df`` whose outcome columns hold the
      residuals, and a dictionary with 'coefficients', 'ci_lower', 'ci_upper',
      'p_values' (DataFrames, feature x outcome) and 'f_statistic', 'r_squared',
      'adj_r_squared', 'aic', 'bic', 'model_p_value' (Series indexed by outcome).
    """
    # Use defaults from config if not provided
    if outcomes is None:
        outcomes = CONFIG["stats"]["outcomes"]["clinical"] + CONFIG["stats"]["outcomes"]["cognitive"]
    if demographics is None:
        demographics = CONFIG["stats"]["demographics"]

    df_residuals = df.copy()

    # Initialize placeholders for results
    table_keys = ('coefficients', 'ci_lower', 'ci_upper', 'p_values')
    series_keys = ('f_statistic', 'r_squared', 'adj_r_squared', 'aic', 'bic', 'model_p_value')
    results = {key: pd.DataFrame(index=[], columns=outcomes) for key in table_keys}
    results.update({key: pd.Series(index=outcomes, dtype=float) for key in series_keys})

    # Identify categorical and numeric demographics
    numeric_features = df[demographics].select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = df[demographics].select_dtypes(exclude=[np.number]).columns.tolist()

    # Preprocessing pipeline: scale numeric, one-hot encode categorical
    transformers = [('num', StandardScaler(), numeric_features)]
    if categorical_features:
        transformers.append(('cat', _make_one_hot_encoder(), categorical_features))
    preprocessor = ColumnTransformer(transformers, remainder='passthrough')

    # Scale the outcomes explicitly
    outcome_scaler = StandardScaler()

    if group_filter:
        # NOTE: the result tables hold one cell per (feature, outcome), so in
        # group-wise mode each group overwrites the previous one and only the
        # statistics of the last group are returned. The residuals are kept
        # for every group.
        for group in df[group_filter].unique():
            group_data = df[df[group_filter] == group]
            for outcome in outcomes:
                df_residuals.loc[group_data.index, outcome] = _residualize_outcome(
                    group_data, outcome, demographics, preprocessor, outcome_scaler,
                    numeric_features, categorical_features, results,
                )
    else:
        # Global regression (apply regression for each outcome separately)
        for outcome in outcomes:
            df_residuals[outcome] = _residualize_outcome(
                df, outcome, demographics, preprocessor, outcome_scaler,
                numeric_features, categorical_features, results,
            )

    if names:
        for key in table_keys:
            results[key].rename(columns=names, index=names, inplace=True)
        for key in series_keys:
            results[key].rename(index=names, inplace=True)

    # Return the modified DataFrame with residuals and regression results
    return df_residuals, results
|
279
|
+
|
280
|
+
def process_regression_on_residuals(df, outcome_cols, predictors, names, group_filter=None):
    """
    Regress the residualized outcomes on the predictors and label the results.

    Parameters:
    - df: DataFrame containing residuals in the original outcome columns.
    - outcome_cols: List of original outcome columns (now containing residuals).
    - predictors: List of predictor variables.
    - names: Dictionary of names for renaming columns and index.
    - group_filter: If specified, only rows whose 'group' column equals this
      value are used for the regression.

    Returns:
    - Dictionary with the keys 'coefficients', 'ci_lower', 'ci_upper',
      'p_values', 'f_statistic', 'r_squared', 'adj_r_squared', 'aic', 'bic'
      and 'model_p_value', all relabelled through ``names``.

    Raises:
    - ValueError: if ``group_filter`` is given but ``df`` has no 'group' column.
    """
    # Restrict to the requested group first, failing early if that is impossible.
    if group_filter:
        if 'group' not in df.columns:
            raise ValueError("The 'group' column does not exist in the DataFrame. Check your group filter.")
        df = df[df['group'] == group_filter]

    # Multivariable regression on the residuals
    (betas, lower_bounds, upper_bounds, pvals,
     f_values, r2, r2_adj, aic_values, bic_values, overall_p) = multivariable_regression_with_residuals(
        df[outcome_cols], df, predictors
    )

    # Tables are relabelled on both axes, per-outcome series on their index only.
    for table in (betas, lower_bounds, upper_bounds, pvals):
        table.rename(columns=names, index=names, inplace=True)
    for series in (f_values, r2, r2_adj, aic_values, bic_values, overall_p):
        series.rename(index=names, inplace=True)

    return {
        "coefficients": betas,
        "ci_lower": lower_bounds,
        "ci_upper": upper_bounds,
        "p_values": pvals,
        "f_statistic": f_values,
        "r_squared": r2,
        "adj_r_squared": r2_adj,
        "aic": aic_values,
        "bic": bic_values,
        "model_p_value": overall_p,
    }
|
330
|
+
|
331
|
+
def multivariable_regression_with_residuals(residuals, df, predictors):
    """
    Perform multivariable regression using residuals as outcomes.
    Predictors are standardized before fitting; rows with a missing outcome or
    predictor value are dropped per outcome.

    Parameters:
    - residuals: DataFrame of residuals from regressing out demographics (one column per outcome).
    - df: The original DataFrame containing all data, including predictors.
    - predictors: List of numeric predictors for the multivariable regression.

    Returns:
    - Tuple of ten objects: coefficients, lower CI bounds, upper CI bounds and
      p-values (DataFrames, outcome x predictor), followed by F-statistics,
      R-squared, adjusted R-squared, AIC, BIC and overall model p-values
      (float Series indexed by outcome). Outcomes skipped for lack of data
      keep missing values.
    """
    outcomes = residuals.columns
    table_keys = ('coefficients', 'ci_lower', 'ci_upper', 'p_values')
    series_keys = ('f_statistic', 'r_squared', 'adj_r_squared', 'aic', 'bic', 'model_p_value')

    results = {key: pd.DataFrame(index=predictors, columns=outcomes) for key in table_keys}
    # Explicit float dtype: an index-only Series has no values to infer a dtype from.
    results.update({key: pd.Series(index=outcomes, dtype=float) for key in series_keys})

    for outcome in outcomes:
        # Combine residuals and predictors to ensure they are aligned
        combined = pd.concat([residuals[outcome], df[predictors]], axis=1).dropna()

        # Check if combined data has sufficient rows
        if combined.shape[0] < len(predictors) + 1:
            print(f"Warning: Not enough data points to perform regression for outcome {outcome}. Skipping...")
            continue

        y = combined[outcome]

        # Scale the numeric predictors and add the intercept; has_constant='add'
        # keeps the intercept in column 0 even if a scaled predictor is constant.
        X_scaled = StandardScaler().fit_transform(combined[predictors])
        X_scaled = sm.add_constant(X_scaled, has_constant='add')

        # Fit the regression model
        model = sm.OLS(y, X_scaled).fit()

        # Work on plain arrays so the lookups below are purely positional,
        # whatever labels statsmodels attaches to the results.
        params = np.asarray(model.params)
        conf_int = np.asarray(model.conf_int())
        pvalues = np.asarray(model.pvalues)

        # Position 0 is the intercept, so predictor i sits at position i + 1.
        for i, predictor in enumerate(predictors):
            results['coefficients'].loc[predictor, outcome] = params[i + 1]
            results['ci_lower'].loc[predictor, outcome] = conf_int[i + 1, 0]
            results['ci_upper'].loc[predictor, outcome] = conf_int[i + 1, 1]
            results['p_values'].loc[predictor, outcome] = pvalues[i + 1]

        # Store model metrics
        results['f_statistic'].loc[outcome] = model.fvalue
        results['r_squared'].loc[outcome] = model.rsquared
        results['adj_r_squared'].loc[outcome] = model.rsquared_adj
        results['aic'].loc[outcome] = model.aic
        results['bic'].loc[outcome] = model.bic
        results['model_p_value'].loc[outcome] = model.f_pvalue

    return (results['coefficients'].T,
            results['ci_lower'].T,
            results['ci_upper'].T,
            results['p_values'].T,
            results['f_statistic'],
            results['r_squared'],
            results['adj_r_squared'],
            results['aic'],
            results['bic'],
            results['model_p_value'])
|
417
|
+
|
418
|
+
def format_regression_results_apa(results_dict, n):
    """
    Format regression results into an APA-style dictionary for multiple outcome measures.

    Parameters:
    - results_dict: Dictionary with the DataFrames 'coefficients', 'ci_lower',
      'ci_upper' and 'p_values' (outcome x predictor) and the Series
      'f_statistic', 'r_squared' and 'model_p_value' (indexed by outcome).
    - n: The number of observations (sample size) used in the regression. The same
      value is used for every outcome when deriving the residual degrees of freedom.

    Returns:
    - A dictionary with outcome names as keys, where each value is another dictionary
      holding the model summary under 'summary' and one APA-formatted string per predictor.
    """
    coefficients = results_dict["coefficients"]
    lower_bounds = results_dict["ci_lower"]
    upper_bounds = results_dict["ci_upper"]
    p_values = results_dict["p_values"]

    # Degrees of freedom are the same for every outcome: one per predictor
    # column, and the observations left after predictors and intercept.
    predictors = list(coefficients.columns)
    df1 = len(predictors)
    df2 = n - df1 - 1

    final_result_dict = {}

    for outcome in results_dict["f_statistic"].index:
        f_stat = results_dict["f_statistic"].loc[outcome]
        r_squared = results_dict["r_squared"].loc[outcome]
        overall_p_value = results_dict["model_p_value"].loc[outcome]

        # Overall regression stats (APA formatted)
        outcome_result_dict = {
            "summary": f"R² = {r_squared:.2f}, F({df1}, {df2}) = {f_stat:.2f}, {format_p_value(overall_p_value)}"
        }

        # One entry per predictor: coefficient, confidence interval, p-value
        for predictor in predictors:
            coeff = coefficients.loc[outcome, predictor]
            ci_l = lower_bounds.loc[outcome, predictor]
            ci_u = upper_bounds.loc[outcome, predictor]
            p_val = p_values.loc[outcome, predictor]

            outcome_result_dict[predictor] = (
                f"β = {coeff:.2f}, 95% CI [{ci_l:.2f}, {ci_u:.2f}], {format_p_value(p_val)}"
            )

        final_result_dict[outcome] = outcome_result_dict

    return final_result_dict
|
473
|
+
|
474
|
+
|
475
|
+
def correlation_apa(df, columns, names):
    """
    Compute pairwise Pearson correlations between the specified columns and return
    them as APA-style strings, renaming index and columns using the `names` dictionary.

    Each pair is evaluated on the rows where both columns are present, so the
    reported r, p-value and degrees of freedom (n - 2) all describe the same sample.

    Parameters:
    - df: pandas DataFrame containing the data.
    - columns: List of column names to compute correlations for.
    - names: Dictionary for renaming the index and columns.

    Returns:
    - A pandas DataFrame (columns x columns) with APA-formatted correlation results.
    """
    # Prepare an empty DataFrame to store results
    apa_corr = pd.DataFrame(index=columns, columns=columns)

    for col1 in columns:
        for col2 in columns:
            # Pairwise-complete observations. Positional access is used because
            # on the diagonal both selected columns carry the same label.
            pair = df[[col1, col2]].dropna()
            r, p_value = pearsonr(pair.iloc[:, 0], pair.iloc[:, 1])

            # Degrees of freedom of the correlation test
            df_corr = pair.shape[0] - 2

            # Format in APA style: r(df) = r_value, p = p_value
            apa_corr.loc[col1, col2] = f"r ({df_corr}) = {r:.2f}, {format_p_value(p_value)}"

    # Apply the renaming using the `names` dictionary
    apa_corr.rename(index=names, columns=names, inplace=True)

    return apa_corr
|
509
|
+
|
510
|
+
def report_categorical_regression(df):
    """
    Build compact APA-style strings from a categorical univariable regression table.

    Parameters:
    - df: DataFrame with one row per model term and the columns 'metric',
      'group', 'coef', 'p', 'r_squared', 'p_model', 'df' and 'f_stat'. Each
      metric needs a row whose 'group' is 'Intercept'; the model-level values
      are read from that row.

    Returns:
    - Dictionary mapping each metric to its formatted string.
    """
    def _group_label(term):
        # Keep what follows the last '[', strip ']' characters from both ends
        # and drop the first two characters of the remainder.
        return term.split('[')[-1].strip(']')[2:]

    formatted_strings = {}

    for metric in df['metric'].unique():
        rows = df[df['metric'] == metric]

        # Model-level numbers live on the intercept row
        base = rows[rows['group'] == 'Intercept'].iloc[0]
        intercept = base['coef']
        r_squared = base['r_squared']
        model_p = format_p_value(base['p_model'])
        dof = base['df']
        f_stat = base['f_stat']
        header = f"R² = {r_squared:.2f}, F {dof} = {f_stat:.2f} {model_p}; Intercept = {intercept:.2f}, "

        # One "<group> = <coef>, <p>" fragment per non-intercept term
        fragments = []
        for _, term_row in rows[rows['group'] != 'Intercept'].iterrows():
            label = _group_label(term_row['group'])
            coef = term_row['coef']
            p_text = format_p_value(term_row['p'])
            fragments.append(f"{label} = {coef:.2f}, {p_text}")

        formatted_strings[metric] = header + "; ".join(fragments)

    return formatted_strings
|
551
|
+
|
552
|
+
def categorize_first_language(lang):
    """
    Collapse a first-language entry into 'German' or 'Other'.

    Matching ignores case and surrounding whitespace. 'german' and 'both' map
    to 'German'; every other value, including missing or non-string entries
    (None, NaN), maps to 'Other'.

    Parameters:
    - lang: Raw first-language value, normally a string.

    Returns:
    - 'German' or 'Other'.
    """
    # Missing values reach this function as None/NaN when it is applied to a
    # DataFrame column; they have no .lower(), so handle them up front.
    if not isinstance(lang, str):
        return 'Other'

    # NOTE(review): 'both' is folded into 'German', so a separate 'Both'
    # category is never produced - confirm this is intended.
    if lang.strip().lower() in ('german', 'both'):
        return 'German'
    return 'Other'
|
560
|
+
|
561
|
+
def _format_group_summary(summary):
    """Build the display columns of the combined table for one summary DataFrame."""
    def mean_sd(prefix):
        # "<mean> (<sd>)" with two decimals, from '<prefix>_mean' / '<prefix>_sd'
        return summary.apply(
            lambda row: f"{row[prefix + '_mean']:.2f} ({row[prefix + '_sd']:.2f})", axis=1
        )

    def count_pct(prefix):
        # "<count> (<pct>%)" from '<prefix>' / '<prefix>_pct'
        return summary.apply(
            lambda row: f"{row[prefix]} ({row[prefix + '_pct']:.2f}%)", axis=1
        )

    table = summary[['group', 'n']].copy()
    table['Age, mean (SD)'] = mean_sd('age')
    table['Gender, Male'] = count_pct('gender_male')
    table['Gender, Female'] = count_pct('gender_female')
    table['Education Years, mean (SD)'] = mean_sd('education')

    # First language counts and percentages
    table['1st Language: German'] = count_pct('first_lang_german')
    table['1st Language: Bilingual'] = count_pct('first_lang_both')
    table['1st Language: Other'] = count_pct('first_lang_other')

    # Both scores are formatted for every group
    table['PANSS, mean (SD)'] = mean_sd('panss_total')
    table['MSS, mean (SD)'] = mean_sd('mss_total')
    return table


def format_combined_table(summary_patients, summary_controls):
    """
    Format the patient and control summaries and stack them into one table.

    Parameters:
    - summary_patients: Summary DataFrame for the patient group(s).
    - summary_controls: Summary DataFrame for the control group(s).
      Both need the columns 'group', 'n', the '<x>_mean' / '<x>_sd' pairs for
      age, education, panss_total and mss_total, and the '<x>' / '<x>_pct'
      pairs for gender_male, gender_female, first_lang_german,
      first_lang_both and first_lang_other.

    Returns:
    - DataFrame with the formatted patient rows followed by the formatted
      control rows, re-indexed from 0.
    """
    return pd.concat(
        [_format_group_summary(summary_patients), _format_group_summary(summary_controls)],
        ignore_index=True,
    )
|
600
|
+
|
601
|
+
|
602
|
+
def format_breakout_table(df_patients):
    """Build the patient-only breakout table of clinical descriptors.

    Parameters
    ----------
    df_patients : pandas.DataFrame
        Per-group patient summary. Must contain ``group`` and the
        ``*_mean`` / ``*_sd`` columns for ``duration_untreated``,
        ``age_onset`` and ``antipsy_duration``. May contain one column per
        diagnosis label and, next to it, a ``<label>_p`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``group``, three "mean (SD)" text columns, an empty
        ``**Diagnosis:**`` spacer column and one text column per diagnosis
        label found in ``df_patients``.
    """
    diagnosis_cols = [
        'Schizophrenia',
        'Brief psychotic disorder',
        'Schizoaffective disorders',
        'MDD with psychotic symptoms',
        'other',
    ]

    def _mean_sd(prefix):
        # Render "<mean> (<sd>)" with two decimals for every row.
        return df_patients.apply(
            lambda row: f"{row[prefix + '_mean']:.2f} ({row[prefix + '_sd']:.2f})",
            axis=1,
        )

    def _format_share(value):
        # Zero is shown as a dash; other numbers get one decimal and a
        # percent sign; non-numeric values pass through untouched.
        if value == 0:
            return '-'
        if isinstance(value, (int, float)):
            return f"{value:.1f}%"
        return value

    breakout_df = df_patients[['group']].copy()
    breakout_df['Duration Untreated, mean (SD)'] = _mean_sd('duration_untreated')
    breakout_df['Age of Onset, mean (SD)'] = _mean_sd('age_onset')
    breakout_df['Antipsychotic Treatment (weeks), mean (SD)'] = _mean_sd('antipsy_duration')
    breakout_df["**Diagnosis:**"] = ""

    # Add diagnosis percentages.
    for col in diagnosis_cols:
        if col not in df_patients.columns:
            continue
        # The values are printed with a "%" suffix, so read them from the
        # "<label>_p" column when the input provides one; the plain
        # "<label>" column is only used as a fallback when no "_p" column
        # exists (previous behaviour).
        pct_col = f"{col}_p"
        source = pct_col if pct_col in df_patients.columns else col
        breakout_df[col] = df_patients[source].apply(_format_share)

    return breakout_df
|
617
|
+
|
618
|
+
def summary_table(df):
    """Produce the demographic summary and the patient breakout table.

    The input frame is modified in place: a ``first_language_category``
    column is added and the numeric demographic/clinical columns are coerced
    to numbers (unparseable values become NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``study_id``, ``group``, ``first_language``, ``gender``,
        ``diagnosis`` and the numeric columns listed below.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(combined_table, breakout_table)`` as built by
        ``format_combined_table`` and ``format_breakout_table``, with group
        labels mapped through ``group_dict`` and the ``group`` column renamed
        to ``Group``.
    """
    # Categorize the first language column
    df['first_language_category'] = df['first_language'].apply(categorize_first_language)

    # Convert numeric columns to float
    to_coerce = ['age', 'education', 'duration_untreated', 'age_onset',
                 'antipsy_duration', 'panss_total', 'mss_total']
    df[to_coerce] = df[to_coerce].apply(pd.to_numeric, errors='coerce')

    shared_fields = ["group", "age", "gender", "education", "first_language_category"]
    patient_only_fields = ["diagnosis", 'duration_untreated', 'age_onset', 'antipsy_duration']
    score_fields = ["panss_total", "mss_total"]

    def _one_row_per_subject(rows, fields):
        # Keep the first available value of each field per study_id.
        per_subject = rows.groupby("study_id")[fields].first()
        return per_subject.reset_index().drop("study_id", axis=1)

    def _count_equal(label):
        # Must stay a lambda: several of these are applied to the same column.
        return lambda values: (values == label).sum()

    def _add_mean_sd(spec, field):
        spec[f'{field}_mean'] = (field, 'mean')
        spec[f'{field}_sd'] = (field, 'std')

    def _summarise(subjects, clinical_fields):
        # Per-group n, means/SDs and category counts, followed by the
        # percentage of each count relative to n.
        spec = {'n': ('age', 'size')}
        _add_mean_sd(spec, 'age')
        spec['gender_male'] = ('gender', _count_equal('male'))
        spec['gender_female'] = ('gender', _count_equal('female'))
        for field in ['education', *clinical_fields, *score_fields]:
            _add_mean_sd(spec, field)
        for key, label in (('first_lang_german', 'German'),
                           ('first_lang_both', 'Both'),
                           ('first_lang_other', 'Other')):
            spec[key] = ('first_language_category', _count_equal(label))

        summary = subjects.groupby('group').agg(**spec).reset_index()

        for counted in ('gender_male', 'gender_female',
                        'first_lang_german', 'first_lang_both', 'first_lang_other'):
            summary[f'{counted}_pct'] = summary[counted] / summary['n'] * 100
        return summary

    # Data preparation
    df_patients = _one_row_per_subject(
        df[df["group"] == "patient"],
        shared_fields + patient_only_fields + score_fields,
    )
    df_controls = _one_row_per_subject(
        df[df["group"] != "patient"],
        shared_fields + score_fields,
    )
    df_patients["diagnosis"] = df_patients["diagnosis"].replace(diag_dict)

    # Calculate summary statistics for groups including n
    summary_patients = _summarise(
        df_patients, ['duration_untreated', 'age_onset', 'antipsy_duration']
    )
    summary_controls = _summarise(df_controls, [])

    # Aggregate diagnosis counts and the row-wise percentage of each diagnosis
    counts = df_patients.groupby(['group', 'diagnosis']).size().unstack(fill_value=0)
    percent = counts.apply(lambda row: np.round(row * 100 / row.sum(), 2), axis=1)
    counts.columns = [f'{name}' for name in counts.columns]
    counts = counts.reset_index()
    percent.columns = [f'{name}_p' for name in percent.columns]
    percent = percent.reset_index()

    # Merge diagnosis counts with summary
    summary_patients = summary_patients.merge(counts, on='group', how='left')
    summary_patients = summary_patients.rename(columns=diag_dict)
    summary_patients = summary_patients.merge(percent, on='group', how='left')
    summary_patients = summary_patients.rename(columns=diag_dict)

    # Create combined and breakout tables
    combined_table = format_combined_table(summary_patients, summary_controls)
    breakout_table = format_breakout_table(summary_patients)

    for table in (combined_table, breakout_table):
        table['group'] = table['group'].replace(group_dict)
        table.rename({"group": "Group"}, axis=1, inplace=True)

    return combined_table, breakout_table
|
745
|
+
|
746
|
+
def main():
    """Main execution function.

    Reads the aggregated CSV named in ``CONFIG``, filters it by minimum token
    count and task type, averages the metrics per ``study_id``, runs the
    normality tests, demographic residualisation, regressions and group
    comparison, builds the summary tables, and writes the results as a pickle
    plus three CSV files into the configured results directory.

    Returns
    -------
    bool
        Always ``True`` once every output has been written.
    """
    lower = CONFIG["shared"]["preprocessing"]["lower"]
    case = "lower" if lower else "upper"

    df_raw = pd.read_csv(
        CONFIG["aggregation"]["paths"]["output"],
        index_col=0,
        dtype=str,
    )

    # Flip the sign of this z-score column (everything was read as text).
    df_raw["z_Real_semantic_include0_includeN_8"] = -df_raw["z_Real_semantic_include0_includeN_8"].astype(float)

    enough_tokens = df_raw["number_tokens"].astype(float) >= CONFIG["min_tokens"]
    df_filtered = df_raw[enough_tokens]
    # Explicit copy: columns are reassigned below, and doing that on a
    # filtered selection triggers pandas' chained-assignment warning.
    df_filtered = df_filtered[df_filtered["task"] == CONFIG["task_type"]].copy()

    exclusions_bev = CONFIG["exclusions_bev"]
    demographics = CONFIG["stats"]["demographics"]

    metrics = CONFIG["metrics"]
    new_metrics = CONFIG["new_metrics"]

    bev_cols_quant = CONFIG["stats"]["outcomes"]["clinical"] + CONFIG["stats"]["outcomes"]["cognitive"]

    # Convert numeric columns to float
    df_filtered[metrics] = df_filtered[metrics].astype(float)
    df_filtered[bev_cols_quant] = df_filtered[bev_cols_quant].astype(float)
    numeric_demographics = ['age', 'education']
    df_filtered[numeric_demographics] = df_filtered[numeric_demographics].astype(float)

    # Get unique columns for aggregation, ensuring we include demographics and
    # group. dict.fromkeys de-duplicates while keeping a stable order, so the
    # column order of task_means is the same on every run (a set is not).
    all_cols = list(dict.fromkeys(metrics + bev_cols_quant + exclusions_bev + demographics + ['group']))

    # Metrics are averaged per subject; every other column keeps its first value.
    metric_names = set(metrics)
    agg_dict = {col: 'mean' if col in metric_names else 'first' for col in all_cols}

    # Perform groupby aggregation
    task_means = df_filtered.drop("sub_task", axis=1).groupby("study_id").agg(agg_dict).reset_index()

    normality_results_means = apply_normality_test(task_means)
    normality_results = apply_normality_test(df_filtered)

    task_means["first_language"] = task_means['first_language'].apply(categorize_first_language)

    metric_residuals, metrics_demographics_results = regress_out_demographics(
        task_means,
        outcomes=metrics,
        demographics=['age', 'gender', 'education', 'first_language'],
        names=names,
    )

    # Process metrics for cognition using residuals: the frame is restricted
    # to non-patient rows here, so no group_filter is passed.
    ls_multivar = process_regression_on_residuals(
        metric_residuals[metric_residuals["group"] != "patient"],
        metrics,  # No need for _resid suffix, these columns are overwritten
        predictors=['working_memory', 'stroop_inhibition'],
        names=names,
        group_filter=None,
    )

    # Process metrics for patients using residuals (filter for patients)
    pat_multivar = process_regression_on_residuals(
        metric_residuals,
        metrics,
        predictors=['working_memory', 'stroop_inhibition', 'panss_pos_sum', 'panss_neg_sum'],
        names=names,
        group_filter="patient",
    )

    # Process metrics for healthy subjects using residuals (filter for healthy subjects)
    hs_multivar = process_regression_on_residuals(
        metric_residuals,
        metrics,
        predictors=['working_memory', 'stroop_inhibition', 'mss_pos_sum', 'mss_neg_sum', 'mss_dis_sum'],
        names=names,
        group_filter="hs",
    )

    patient_means = task_means[task_means["group"] == "patient"]
    corr_control_cog_demo_length_patients = hierarchical_regression_with_vif(
        patient_means,
        new_metrics,
        ['panss_pos_sum', 'panss_neg_sum'],
        ["stroop_inhibition", "working_memory", "number_tokens"],
    )
    corr_control_cog_demo_patients = hierarchical_regression_with_vif(
        patient_means,
        metrics,
        ['panss_neg_sum'],
        ["stroop_inhibition", "working_memory"],
    )

    path = CONFIG["stats"]["paths"]["figures_dir"]

    # These values are defined here rather than read from CONFIG.
    alpha = 0.05
    num_tests = 4
    corrected_alpha = alpha / num_tests
    n_groups = len(df_filtered["group"].unique())
    n_subjects = len(df_filtered["study_id"].unique())

    anova, pairwise = compare_groups_single_plot_regression(
        task_means, metrics, "group", path, scale_data=False, n_comp=len(metrics)
    )
    anova_df = pd.DataFrame(list(anova))
    pairwise_df = pd.DataFrame(list(pairwise))

    # Copy after selecting the columns so the formatted columns added below
    # are written to an independent frame.
    anova_table = anova_df.reset_index(drop=True)[[
        'metric', 'r_squared', 'f_stat', 'p_model',
        'Low Sczt mean', 'Low Sczt sd',
        'High Sczt mean', 'High Sczt sd',
        'Psychosis mean', 'Psychosis sd',
    ]].copy()

    for label in ('Low Sczt', 'High Sczt', 'Psychosis'):
        anova_table[label] = anova_table.apply(
            lambda row, label=label: f"{row[label + ' mean']:.2f} ({row[label + ' sd']:.2f})",
            axis=1,
        )

    anova_table['r_squared'] = anova_table.apply(lambda row: f"{row['r_squared']:.2f}", axis=1)
    anova_table['f_stat'] = anova_table.apply(
        lambda row: f"{row['f_stat']:.2f} ({format_p_value(row['p_model'])})", axis=1
    )

    # Drop the separate mean and SD columns
    anova_table = anova_table[['metric', 'r_squared', 'f_stat', 'Low Sczt', 'High Sczt', 'Psychosis']]

    # Rename columns to the final desired names
    anova_table.columns = ["Metric", "${R}^2$", "F", "Low Sczt (Mean, SD)", "High Sczt (Mean, SD)", "Psychosis (Mean, SD)"]

    pairwise_table = pairwise_df.copy().reset_index(drop=True)
    pairwise_table.columns = ["Metric", "Comparison", "Mean Diff.", "p adj."]

    # One row per subject with the fields needed for the demographic tables.
    df_summary = df_filtered[[
        "study_id",
        "group",
        "age",
        "gender",
        "education",
        "first_language",
        "diagnosis",
        'duration_untreated',
        'age_onset',
        'antipsy_duration',
        "panss_total",
        "mss_total",
    ]].groupby("study_id").first().reset_index()

    formatted_summary, patient_breakout = summary_table(df_summary)

    results = {
        "corr_df": correlation_apa(task_means, cog_var + metrics, names),
        "formatted_summary": formatted_summary,
        "breakout_patients": patient_breakout,
        "normality_results": normality_results,
        "normality_results_means": normality_results_means,
        "demographic_factors_metrics": metrics_demographics_results,
        "metrics_ls_reg": format_regression_results_apa(ls_multivar, 40),
        "metrics_ls_reg_dict": ls_multivar,
        "metrics_pat_reg": format_regression_results_apa(pat_multivar, 40),
        "metrics_pat_reg_dict": pat_multivar,
        "metrics_hs_reg": format_regression_results_apa(hs_multivar, 40),
        "metrics_hs_reg_dict": hs_multivar,

        "stepwise_length": corr_control_cog_demo_length_patients,
        "stepwise": corr_control_cog_demo_patients,

        "stepwise_length_str": report_stepwise_regression(corr_control_cog_demo_length_patients),
        "stepwise_str": report_stepwise_regression(corr_control_cog_demo_patients),

        "pairwise": pairwise_table,
        "anova": anova_table,
        "anova_str": report_categorical_regression(anova_df),
        "alpha": alpha,
        "num_tests": num_tests,
        "corrected_alpha": corrected_alpha,
        "n_groups": n_groups,
        "n_subjects": n_subjects,
    }

    # Every output file carries the same "_lower" / "_upper" suffix.
    results_dir = CONFIG["stats"]["paths"]["results_dir"]
    suffix = f"_{case}"

    with open(os.path.join(results_dir, f"stats_results{suffix}.pkl"), 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

    df_filtered.to_csv(os.path.join(results_dir, f"filtered_df{suffix}.csv"), index=False)
    df_raw[df_raw["task"] == CONFIG["task_type"]].to_csv(
        os.path.join(results_dir, f"raw_df{suffix}.csv"), index=False
    )
    task_means.to_csv(os.path.join(results_dir, f"task_means{suffix}.csv"), index=False)

    return True
|
928
|
+
|
929
|
+
# Run the statistics pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|