pelican_nlp-0.1.1-py3-none-any.whl → pelican_nlp-0.1.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/fluency/stats_fluency.py (new file)
@@ -0,0 +1,930 @@
+import numpy as np
+from scipy.stats import pearsonr, shapiro
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+import statsmodels.api as sm
+import pandas as pd
+import pickle
+from plot_fluency import plot_regression_with_levels, plot_regression_coefficients, compare_groups, compare_groups_single_plot_regression, plot_stepwise_regression_results
+
+from plotting_utils import (
+    GROUP_DICT as group_dict,
+    NAMES as names,
+    COG_VAR as cog_var,
+    format_p_value,
+    set_size
+)
+
+from config import CONFIG
+import os
+
+# Use configuration from config.py
+diag_dict = CONFIG["stats"]["groups"]
+
+# Create results directories if they don't exist
+os.makedirs(CONFIG["stats"]["paths"]["results_dir"], exist_ok=True)
+os.makedirs(CONFIG["stats"]["paths"]["figures_dir"], exist_ok=True)
+
+def apply_normality_test(df):
+    """
+    Applies the Shapiro-Wilk normality test to each column of the DataFrame after dropping NaN values.
+    Returns a DataFrame with the test statistics and p-values.
+    """
+    results = {'Column': [], 'Statistic': [], 'P-Value': []}
+
+    for column in df.columns:
+        # Ensure the column is numeric
+        if pd.api.types.is_numeric_dtype(df[column]):
+            # Drop NaN values
+            cleaned_data = df[column].dropna()
+            if len(cleaned_data) > 0:  # Ensure there is data to test
+                stat, p_value = shapiro(cleaned_data)
+                results['Column'].append(column)
+                results['Statistic'].append(stat)
+                results['P-Value'].append(p_value)
+            else:
+                results['Column'].append(column)
+                results['Statistic'].append(None)
+                results['P-Value'].append(None)
+        else:
+            results['Column'].append(column)
+            results['Statistic'].append(None)
+            results['P-Value'].append(None)
+
+    return pd.DataFrame(results)
+
+
+def calculate_vif(X):
+    """
+    Calculate the VIF for each feature in the DataFrame X.
+    """
+    vif_data = pd.DataFrame()
+    vif_data["Variable"] = X.columns
+    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
+    vif_data["Variable"] = vif_data["Variable"].apply(lambda x: names[x])
+    return vif_data
+
+
+def hierarchical_regression_with_vif(data, metrics, scores, control):
+    results = []
+    scaler = StandardScaler()
+
+    for score in scores:
+        for metric in metrics:
+            # Scale the data
+            X1_scaled = scaler.fit_transform(data[control])
+            y_scaled = scaler.fit_transform(data[[score]])
+
+            X1 = sm.add_constant(X1_scaled)
+            model1 = sm.OLS(y_scaled, X1).fit()
+
+            X2_list = ([control] if isinstance(control, str) else control) + ([metric] if isinstance(metric, str) else metric)
+            X2_scaled = scaler.fit_transform(data[X2_list])
+
+            X2 = sm.add_constant(X2_scaled)
+            model2 = sm.OLS(y_scaled, X2).fit()
+
+            # Calculate VIF for the full model
+            vif = calculate_vif(pd.DataFrame(X2, columns=['const'] + X2_list))
+
+            r2_adj_change = model2.rsquared_adj - model1.rsquared_adj
+            r2_change = model2.rsquared - model1.rsquared
+            anova_results = sm.stats.anova_lm(model1, model2)
+            f_value = anova_results['F'][1]
+            p_value = anova_results['Pr(>F)'][1]
+
+            df1_control = len(control)  # Number of predictors for this outcome
+            df2_control = len(data) - df1_control - 1  # Residual degrees of freedom
+
+            df1_full = len(control) + 1  # Number of predictors for this outcome
+            df2_full = len(data) - df1_full - 1  # Residual degrees of freedom
+
+            results.append({
+                'psychiatric_score': score,
+                'control': control,
+                'metric': metric,
+                'r2_adj_control': model1.rsquared_adj,
+                'r2_adj_full': model2.rsquared_adj,
+                'r2_adj_change': r2_adj_change,
+                'r2_control': model1.rsquared,
+                'df_control': f"({df1_control}, {df2_control})",
+                'df_full': f"({df1_full}, {df2_full})",
+                'r2_full': model2.rsquared,
+                'r2_change': r2_change,
+                'f_value': f_value,
+                'p_value': p_value,
+                'vif': vif.set_index('Variable')['VIF'].to_dict(),  # Save VIF as a dictionary
+                'model1': dict(zip(control, model1.params.flatten())),  # Convert to a dictionary
+                'model2': dict(zip(X2_list, model2.params.flatten())),  # Convert to a dictionary
+            })
+
+    results_df = pd.DataFrame(results)
+    return results_df
+
+def report_stepwise_regression(df):
+    """
+    Generate compact APA-style formatted strings from the result of stepwise regression.
+
+    Parameters:
+    - df: DataFrame containing the stepwise regression results.
+
+    Returns:
+    - A list of formatted strings for each row of the DataFrame.
+    """
+    formatted_strings = {}
+
+    for index, row in df.iterrows():
+        # Extract the relevant data from the row
+        metric = row['metric']
+        r2_adj_control = row['r2_adj_control']  # Adjusted R² for control model
+        r2_adj_full = row['r2_adj_full']  # Adjusted R² for full model
+        f_value = row['f_value']  # F statistic
+        p_value = row['p_value']  # p-value
+        df_control = row['df_control']
+        df_full = row['df_full']
+        # Format the string in compact APA style
+        formatted_strings[names[metric]] = (
+            f"R² control {df_control} = {r2_adj_control:.3f}, R² full {df_full} = {r2_adj_full:.3f}, F = {f_value:.2f}, {format_p_value(p_value)}."
+        )
+
+    return formatted_strings
+
+
+def regress_out_demographics(df, outcomes=None, demographics=None, names=None, group_filter=None):
+    """
+    Regress out demographic variables from each outcome separately.
+    The original outcome columns are scaled and then replaced with residuals,
+    while keeping the rest of the DataFrame intact.
+
+    Parameters:
+    - df: Original DataFrame containing the full dataset.
+    - outcomes: List of outcome columns to scale and regress out demographics from. Defaults to all outcomes from config.
+    - demographics: List of demographic variables. Defaults to demographics from config.
+    - names: Dictionary mapping column names to display names
+    - group_filter: If specified, regress out demographics within each group separately.
+
+    Returns:
+    - Tuple containing the modified DataFrame with residuals and a dictionary of regression results.
+    """
+    # Use defaults from config if not provided
+    if outcomes is None:
+        outcomes = CONFIG["stats"]["outcomes"]["clinical"] + CONFIG["stats"]["outcomes"]["cognitive"]
+    if demographics is None:
+        demographics = CONFIG["stats"]["demographics"]
+
+    df_residuals = df.copy()
+
+    # Initialize placeholders for results
+    results = {
+        'coefficients': pd.DataFrame(index=[], columns=outcomes),
+        'ci_lower': pd.DataFrame(index=[], columns=outcomes),
+        'ci_upper': pd.DataFrame(index=[], columns=outcomes),
+        'p_values': pd.DataFrame(index=[], columns=outcomes),
+        'f_statistic': pd.Series(index=outcomes, dtype=float),
+        'r_squared': pd.Series(index=outcomes, dtype=float),
+        'adj_r_squared': pd.Series(index=outcomes, dtype=float),
+        'aic': pd.Series(index=outcomes, dtype=float),
+        'bic': pd.Series(index=outcomes, dtype=float),
+        'model_p_value': pd.Series(index=outcomes, dtype=float)
+    }
+
+    # Identify categorical and numeric demographics
+    numeric_features = df[demographics].select_dtypes(include=[np.number]).columns.tolist()
+    categorical_features = df[demographics].select_dtypes(exclude=[np.number]).columns.tolist()
+
+    # Preprocessing pipeline: scale numeric, one-hot encode categorical
+    transformers = [('num', StandardScaler(), numeric_features)]
+    if categorical_features:
+        transformers.append(('cat', OneHotEncoder(drop='first', sparse=False), categorical_features))
+
+    preprocessor = ColumnTransformer(transformers, remainder='passthrough')
+
+    # Scale the outcomes explicitly
+    outcome_scaler = StandardScaler()
+
+    if group_filter:
+        # Apply group-wise regression for each subgroup
+        for group in df[group_filter].unique():
+            group_data = df[df[group_filter] == group]
+            for outcome in outcomes:
+                # Scale the outcome data before regression
+                y_scaled = outcome_scaler.fit_transform(group_data[[outcome]]).flatten()
+
+                X = preprocessor.fit_transform(group_data[demographics])
+                X = sm.add_constant(X)  # Add constant (intercept)
+
+                # Extract feature names from the ColumnTransformer
+                feature_names = ['const'] + numeric_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
+
+                # Fit demographic model and calculate residuals for this outcome
+                model = sm.OLS(y_scaled, X).fit()
+                df_residuals.loc[group_data.index, outcome] = model.resid
+
+                # Store coefficients, confidence intervals, p-values
+                for i, name in enumerate(feature_names):
+                    if name != 'const':  # Ignore the constant for demographics results
+                        results['coefficients'].loc[name, outcome] = model.params[i]
+                        results['ci_lower'].loc[name, outcome] = model.conf_int()[i, 0]
+                        results['ci_upper'].loc[name, outcome] = model.conf_int()[i, 1]
+                        results['p_values'].loc[name, outcome] = model.pvalues[i]
+
+                # Store model metrics
+                results['f_statistic'].loc[outcome] = model.fvalue
+                results['r_squared'].loc[outcome] = model.rsquared
+                results['adj_r_squared'].loc[outcome] = model.rsquared_adj
+                results['aic'].loc[outcome] = model.aic
+                results['bic'].loc[outcome] = model.bic
+                results['model_p_value'].loc[outcome] = model.f_pvalue
+
+    else:
+        # Global regression (apply regression for each outcome separately)
+        for outcome in outcomes:
+            # Scale the outcome data before regression
+            y_scaled = outcome_scaler.fit_transform(df[[outcome]]).flatten()
+            X = preprocessor.fit_transform(df[demographics])
+            X = sm.add_constant(X)  # Add constant (intercept)
+
+            # Extract feature names from the ColumnTransformer
+            feature_names = ['const'] + numeric_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
+
+            # Fit demographic model and calculate residuals for this outcome
+            model = sm.OLS(y_scaled, X).fit()
+            df_residuals[outcome] = model.resid
+
+            # Store coefficients, confidence intervals, p-values
+            for i, name in enumerate(feature_names):
+                if name != 'const':  # Ignore the constant for demographics results
+                    results['coefficients'].loc[name, outcome] = model.params[i]
+                    results['ci_lower'].loc[name, outcome] = model.conf_int()[i, 0]
+                    results['ci_upper'].loc[name, outcome] = model.conf_int()[i, 1]
+                    results['p_values'].loc[name, outcome] = model.pvalues[i]
+
+            # Store model metrics
+            results['f_statistic'].loc[outcome] = model.fvalue
+            results['r_squared'].loc[outcome] = model.rsquared
+            results['adj_r_squared'].loc[outcome] = model.rsquared_adj
+            results['aic'].loc[outcome] = model.aic
+            results['bic'].loc[outcome] = model.bic
+            results['model_p_value'].loc[outcome] = model.f_pvalue
+
+    if names:
+        for result_df in [results['coefficients'], results['ci_lower'], results['ci_upper'], results['p_values']]:
+            result_df.rename(columns=names, index=names, inplace=True)
+        for results_series in [results['f_statistic'], results['r_squared'], results['adj_r_squared'], results['aic'], results['bic'], results['model_p_value']]:
+            results_series.rename(index=names, inplace=True)
+
+    # Return the modified DataFrame with residuals and regression results
+    return df_residuals, results
+
+def process_regression_on_residuals(df, outcome_cols, predictors, names, group_filter=None):
+    """
+    Perform regression on residuals (which are now in the original outcome columns), rename columns and index,
+    and format p-values, optionally filtering by group.
+
+    Parameters:
+    - df: DataFrame containing residuals in the original outcome columns.
+    - outcome_cols: List of original outcome columns (now containing residuals).
+    - predictors: List of predictor variables.
+    - names: Dictionary of names for renaming columns and index.
+    - prefix: Optional string prefix to add to formatted column names and dictionary keys.
+    - group_filter: If specified, filter the DataFrame by this group before running the regression.
+
+    Returns:
+    - Dictionary containing coefficients, confidence intervals (lower and upper), p-values, formatted results, and other metrics.
+    """
+
+    # Apply the group filter if provided
+    if group_filter:
+        if 'group' in df.columns:
+            df = df[df['group'] == group_filter]
+        else:
+            raise ValueError(f"The 'group' column does not exist in the DataFrame. Check your group filter.")
+
+    # Perform the multivariable regression on the residuals
+    coeff, coeff_lower, coeff_upper, p_values, f_stat, r_squared, adj_r_squared, aic, bic, model_p_value = multivariable_regression_with_residuals(
+        df[outcome_cols], df, predictors
+    )
+
+    # Apply renaming for coefficients, confidence intervals, and p-values
+    for result_df in [coeff, coeff_lower, coeff_upper, p_values]:
+        result_df.rename(columns=names, index=names, inplace=True)
+    for results_series in [f_stat, r_squared, adj_r_squared, aic, bic, model_p_value]:
+        results_series.rename(index=names, inplace=True)
+
+    # Create the result dictionary with dynamic keys
+    result_dict = {
+        "coefficients": coeff,
+        "ci_lower": coeff_lower,
+        "ci_upper": coeff_upper,
+        "p_values": p_values,
+        "f_statistic": f_stat,
+        "r_squared": r_squared,
+        "adj_r_squared": adj_r_squared,
+        "aic": aic,
+        "bic": bic,
+        "model_p_value": model_p_value
+    }
+
+    return result_dict
+
+def multivariable_regression_with_residuals(residuals, df, predictors):
+    """
+    Perform multivariable regression using residuals as outcomes.
+    Handles scaling of numeric predictors.
+
+    Parameters:
+    - residuals: DataFrame of residuals from regressing out demographics.
+    - df: The original DataFrame containing all data, including predictors.
+    - predictors: List of numeric predictors for the multivariable regression.
+
+    Returns:
+    - DataFrames of coefficients, confidence intervals, p-values, F-statistics, R-squared, Adjusted R-squared, and overall p-values.
+    - Dictionary containing the preprocessed predictor matrix (X_preprocessed) for each outcome.
+    """
+    results = {
+        'coefficients': pd.DataFrame(index=predictors, columns=residuals.columns),
+        'ci_lower': pd.DataFrame(index=predictors, columns=residuals.columns),
+        'ci_upper': pd.DataFrame(index=predictors, columns=residuals.columns),
+        'p_values': pd.DataFrame(index=predictors, columns=residuals.columns),
+        'f_statistic': pd.Series(index=residuals.columns),
+        'r_squared': pd.Series(index=residuals.columns),
+        'adj_r_squared': pd.Series(index=residuals.columns),
+        'aic': pd.Series(index=residuals.columns),
+        'bic': pd.Series(index=residuals.columns),
+        'model_p_value': pd.Series(index=residuals.columns)  # New addition for overall p-values
+    }
+
+    for outcome in residuals.columns:
+        # Combine residuals and predictors to ensure they are aligned
+        combined = pd.concat([residuals[outcome], df[predictors]], axis=1).dropna()
+
+        # Check if combined data has sufficient rows
+        if combined.shape[0] < len(predictors) + 1:
+            print(f"Warning: Not enough data points to perform regression for outcome {outcome}. Skipping...")
+            continue
+
+        # Separate the aligned residuals and predictors
+        y = combined[outcome]
+        X = combined[predictors]
+
+        # Scale the numeric predictors
+        scaler = StandardScaler()
+        X_scaled = scaler.fit_transform(X)
+
+        # Add a constant (intercept) term
+        X_scaled = sm.add_constant(X_scaled)
+
+        # Fit the regression model
+        model = sm.OLS(y, X_scaled).fit()
+
+        # Map the model's parameter names (e.g., x1, x2) back to the original predictor names
+        params = model.params
+        conf_int = model.conf_int()
+        pvalues = model.pvalues
+        # Store coefficients, confidence intervals, and p-values (excluding the constant term)
+        for i, predictor in enumerate(predictors):
+            results['coefficients'].loc[predictor, outcome] = params[i+1]  # Skip constant
+            results['ci_lower'].loc[predictor, outcome] = conf_int.iloc[i+1, 0]
+            results['ci_upper'].loc[predictor, outcome] = conf_int.iloc[i+1, 1]
+            results['p_values'].loc[predictor, outcome] = pvalues[i+1]  # Skip constant
+
+        # Store model metrics
+        results['f_statistic'].loc[outcome] = model.fvalue
+        results['r_squared'].loc[outcome] = model.rsquared
+        results['adj_r_squared'].loc[outcome] = model.rsquared_adj
+        results['aic'].loc[outcome] = model.aic
+        results['bic'].loc[outcome] = model.bic
+
+        # Calculate the degrees of freedom
+        df_model = len(predictors)  # Number of predictors
+        df_resid = model.df_resid  # Residual degrees of freedom
+
+        # Calculate the overall p-value using the F-distribution
+        overall_p_value = model.f_pvalue
+        results['model_p_value'].loc[outcome] = overall_p_value
+
+    return (results['coefficients'].T,
+            results['ci_lower'].T,
+            results['ci_upper'].T,
+            results['p_values'].T,
+            results['f_statistic'],
+            results['r_squared'],
+            results['adj_r_squared'],
+            results['aic'],
+            results['bic'],
+            results['model_p_value'])
+
+def format_regression_results_apa(results_dict, n):
+    """
+    Format regression results into an APA-style dictionary for multiple outcome measures.
+
+    Parameters:
+    - results_dict: Dictionary containing regression results, including coefficients, confidence intervals, F-statistic, R-squared, overall p-value, etc.
+    - n: The number of observations (sample size) used in the regression.
+
+    Returns:
+    - A dictionary with outcome names as keys, where each key contains another dictionary with the model summary
+      and APA-formatted strings for each variable for that outcome.
+    """
+
+    # Create a dictionary to store the output for multiple outcomes
+    final_result_dict = {}
+
+    # Iterate over each outcome in the result dictionary
+    for outcome in results_dict["f_statistic"].index:
+
+        # Extract necessary metrics for the current outcome
+        coefficients = results_dict["coefficients"].loc[outcome]
+        ci_lower = results_dict["ci_lower"].loc[outcome]
+        ci_upper = results_dict["ci_upper"].loc[outcome]
+        f_stat = results_dict["f_statistic"].loc[outcome]  # F-statistic for the current outcome
+        r_squared = results_dict["r_squared"].loc[outcome]  # R-squared for the current outcome
+        adj_r_squared = results_dict["adj_r_squared"].loc[outcome]  # Adjusted R-squared for the current outcome
+        p_values = results_dict["p_values"].loc[outcome]
+        overall_p_value = results_dict["model_p_value"].loc[outcome]  # Overall p-value for the current outcome
+        # Assuming df1 is the number of predictors and df2 is the number of observations minus predictors minus 1
+        df1 = len(coefficients)  # Number of predictors for this outcome
+        df2 = n - df1 - 1  # Residual degrees of freedom
+
+        # Overall regression stats (APA formatted)
+        summary = (
+            f"R² = {r_squared:.2f}, F({df1}, {df2}) = {f_stat:.2f}, {format_p_value(overall_p_value)}"
+        )
+
+        # Create a dictionary for the current outcome to store the summary and coefficient details
+        outcome_result_dict = {"summary": summary}
+
+        for predictor in results_dict["coefficients"].columns:
+            coeff = results_dict["coefficients"].loc[outcome, predictor]  # Extract scalar values
+            ci_l = results_dict["ci_lower"].loc[outcome, predictor]  # Extract scalar values
+            ci_u = results_dict["ci_upper"].loc[outcome, predictor]  # Extract scalar values
+            p_val = results_dict["p_values"].loc[outcome, predictor]  # Extract scalar values
+
+            # APA style for variable coefficients with confidence intervals
+            outcome_result_dict[predictor] = (
+                f"β = {coeff:.2f}, 95% CI [{ci_l:.2f}, {ci_u:.2f}], {format_p_value(p_val)}"
+            )
+
+        # Store the result for this outcome in the final dictionary
+        final_result_dict[outcome] = outcome_result_dict
+
+    return final_result_dict
+
+
+def correlation_apa(df, columns, names):
+    """
+    Compute correlations between the specified columns in a DataFrame and return results in APA style,
+    while renaming the index and columns using the `names` dictionary.
+
+    Parameters:
+    - df: pandas DataFrame containing the data.
+    - columns: List of column names to compute correlations for.
+    - names: Dictionary for renaming the index and columns.
+
+    Returns:
+    - A pandas DataFrame with APA-formatted correlation results.
+    """
+
+    # Prepare an empty DataFrame to store results
+    apa_corr = pd.DataFrame(index=columns, columns=columns)
+
+    # Compute correlations for each pair of columns
+    for i, col1 in enumerate(columns):
+        for j, col2 in enumerate(columns):
+            # Compute Pearson correlation
+            r, p_value = pearsonr(df[col1], df[col2])
+
+            # Degrees of freedom
+            n = df[[col1, col2]].dropna().shape[0]  # Exclude missing data
+            df_corr = n - 2
+
+            # Format in APA style: r(df) = r_value, p = p_value
+            apa_corr.loc[col1, col2] = f"r ({df_corr}) = {r:.2f}, {format_p_value(p_value)}"
+
+    # Apply the renaming using the `names` dictionary
+    apa_corr.rename(index=names, columns=names, inplace=True)
+
+    return apa_corr
+
+def report_categorical_regression(df):
+    """
+    Generate compact APA-style formatted strings from a categorical univariable regression table.
+
+    Parameters:
+    - df: DataFrame containing the regression results.
+
+    Returns:
+    - A list of compact formatted strings for each metric.
+    """
+    formatted_strings = {}
+
+    for metric in df['metric'].unique():
+        # Filter rows for the current metric
+        metric_df = df[df['metric'] == metric]
+
+        # Extract intercept information
+        intercept_row = metric_df[metric_df['group'] == 'Intercept'].iloc[0]
+        intercept = intercept_row['coef']
+        r_squared = intercept_row['r_squared']
+        model_p = format_p_value(intercept_row['p_model'])
+        dof = intercept_row['df']
+        f_stat = intercept_row['f_stat']
+        # Start constructing the compact APA-style string
+        formatted_string = f"R² = {r_squared:.2f}, F {dof} = {f_stat:.2f} {model_p}; Intercept = {intercept:.2f}, "
+
+        # Loop through each group comparison (excluding the intercept)
+        group_strings = []
+        for _, row in metric_df[metric_df['group'] != 'Intercept'].iterrows():
+            group = row['group'].split('[')[-1].strip(']')[2:]
+            coef = row['coef']
+            p_value = row['p']
+            group_strings.append(f"{group} = {coef:.2f}, {format_p_value(p_value)}")
+
+        # Combine group results and add the R² value
+        formatted_string += "; ".join(group_strings)
+
+        # Append the formatted string to the list
+        formatted_strings[metric] = formatted_string
+
+    return formatted_strings
+
+def categorize_first_language(lang):
+    """Categorize first language into 'German', 'Both', or 'Other'."""
+    if lang.lower() == 'german':
+        return 'German'
+    elif lang.lower() == 'both':
+        return 'German'
+    else:
+        return 'Other'
+
+def format_combined_table(summary_patients, summary_controls):
+    # Prepare shared variables in combined table
+    combined_df = summary_patients[['group', 'n']].copy()
+    combined_df['Age, mean (SD)'] = summary_patients.apply(lambda x: f"{x['age_mean']:.2f} ({x['age_sd']:.2f})", axis=1)
+    combined_df['Gender, Male'] = summary_patients.apply(lambda x: f"{x['gender_male']} ({x['gender_male_pct']:.2f}%)", axis=1)
+    combined_df['Gender, Female'] = summary_patients.apply(lambda x: f"{x['gender_female']} ({x['gender_female_pct']:.2f}%)", axis=1)
+    combined_df['Education Years, mean (SD)'] = summary_patients.apply(lambda x: f"{x['education_mean']:.2f} ({x['education_sd']:.2f})", axis=1)
+
+
+    # Add first language counts and percentages
+    combined_df['1st Language: German'] = summary_patients.apply(lambda x: f"{x['first_lang_german']} ({x['first_lang_german_pct']:.2f}%)", axis=1)
+    combined_df['1st Language: Bilingual'] = summary_patients.apply(lambda x: f"{x['first_lang_both']} ({x['first_lang_both_pct']:.2f}%)", axis=1)
+    combined_df['1st Language: Other'] = summary_patients.apply(lambda x: f"{x['first_lang_other']} ({x['first_lang_other_pct']:.2f}%)", axis=1)
+
+    # Add PANSS for patients and MSS for controls
+    combined_df['PANSS, mean (SD)'] = summary_patients.apply(lambda x: f"{x['panss_total_mean']:.2f} ({x['panss_total_sd']:.2f})", axis=1)
+    combined_df['MSS, mean (SD)'] = summary_patients.apply(lambda x: f"{x['mss_total_mean']:.2f} ({x['mss_total_sd']:.2f})", axis=1)
+
+    # Add controls to the combined table
+    combined_df_controls = summary_controls[['group', 'n']].copy()
+
+    combined_df_controls['Age, mean (SD)'] = summary_controls.apply(lambda x: f"{x['age_mean']:.2f} ({x['age_sd']:.2f})", axis=1)
+    combined_df_controls['Gender, Male'] = summary_controls.apply(lambda x: f"{x['gender_male']} ({x['gender_male_pct']:.2f}%)", axis=1)
+    combined_df_controls['Gender, Female'] = summary_controls.apply(lambda x: f"{x['gender_female']} ({x['gender_female_pct']:.2f}%)", axis=1)
+
+    combined_df_controls['Education Years, mean (SD)'] = summary_controls.apply(lambda x: f"{x['education_mean']:.2f} ({x['education_sd']:.2f})", axis=1)
+
+    combined_df_controls['1st Language: German'] = summary_controls.apply(lambda x: f"{x['first_lang_german']} ({x['first_lang_german_pct']:.2f}%)", axis=1)
+    combined_df_controls['1st Language: Bilingual'] = summary_controls.apply(lambda x: f"{x['first_lang_both']} ({x['first_lang_both_pct']:.2f}%)", axis=1)
+    combined_df_controls['1st Language: Other'] = summary_controls.apply(lambda x: f"{x['first_lang_other']} ({x['first_lang_other_pct']:.2f}%)", axis=1)
+
+    # Add PANSS for patients and MSS for controls
+    combined_df_controls['PANSS, mean (SD)'] = summary_controls.apply(lambda x: f"{x['panss_total_mean']:.2f} ({x['panss_total_sd']:.2f})", axis=1)
+    combined_df_controls['MSS, mean (SD)'] = summary_controls.apply(lambda x: f"{x['mss_total_mean']:.2f} ({x['mss_total_sd']:.2f})", axis=1)
+
+
+    combined_table = pd.concat([combined_df, combined_df_controls], ignore_index=True)
+
+    return combined_table
+
+
+def format_breakout_table(df_patients):
+    breakout_df = df_patients[['group']].copy()
+
+
+    breakout_df['Duration Untreated, mean (SD)'] = df_patients.apply(lambda x: f"{x['duration_untreated_mean']:.2f} ({x['duration_untreated_sd']:.2f})", axis=1)
+    breakout_df['Age of Onset, mean (SD)'] = df_patients.apply(lambda x: f"{x['age_onset_mean']:.2f} ({x['age_onset_sd']:.2f})", axis=1)
+    breakout_df['Antipsychotic Treatment (weeks), mean (SD)'] = df_patients.apply(lambda x: f"{x['antipsy_duration_mean']:.2f} ({x['antipsy_duration_sd']:.2f})", axis=1)
+    breakout_df["**Diagnosis:**"] = ""
+    # Add diagnosis percentages
+    diagnosis_cols = ['Schizophrenia', 'Brief psychotic disorder', 'Schizoaffective disorders', 'MDD with psychotic symptoms', 'other']
+    for col in diagnosis_cols:
+        if col in df_patients.columns:
+            breakout_df[col] = df_patients[col].apply(lambda x: '-' if x == 0 else f"{x:.1f}%" if isinstance(x, (int, float)) else x)
+
+    return breakout_df
+
+def summary_table(df):
+    # Categorize the first language column
+    df['first_language_category'] = df['first_language'].apply(categorize_first_language)
+
+    # Convert numeric columns to float
+    numeric_cols = ['age', 'education', 'duration_untreated', 'age_onset', 'antipsy_duration', 'panss_total', 'mss_total']
+    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
+
+    # Data preparation
+    df_patients = df[df["group"] == "patient"].groupby("study_id")[[
+        "group",
+        "age",
+        "gender",
+        "education",
+        "first_language_category",
+        "diagnosis",
+        'duration_untreated',
+        'age_onset',
+        'antipsy_duration',
+        "panss_total",
+        "mss_total"
+    ]].first().reset_index().drop("study_id", axis=1)
+
+    df_controls = df[df["group"] != "patient"].groupby("study_id")[[
+        "group",
+        "age",
+        "gender",
+        "education",
+        "first_language_category",
+        "panss_total",
+        "mss_total"
+    ]].first().reset_index().drop("study_id", axis=1)
+
+    df_patients["diagnosis"] = df_patients["diagnosis"].replace(diag_dict)
+
+    # Calculate summary statistics for groups including n
+    summary_patients = df_patients.groupby('group').agg(
+        n=('age', 'size'),
+        age_mean=('age', 'mean'),
+        age_sd=('age', 'std'),
+
+        gender_male=('gender', lambda x: (x == 'male').sum()),
+        gender_female=('gender', lambda x: (x == 'female').sum()),
+
+        education_mean=('education', 'mean'),
+        education_sd=('education', 'std'),
+
+        duration_untreated_mean=('duration_untreated', 'mean'),
+        duration_untreated_sd=('duration_untreated', 'std'),
+
+        age_onset_mean=('age_onset', 'mean'),
+        age_onset_sd=('age_onset', 'std'),
+
+        antipsy_duration_mean=('antipsy_duration', 'mean'),
+        antipsy_duration_sd=('antipsy_duration', 'std'),
+
+        panss_total_mean=('panss_total', 'mean'),
+        panss_total_sd=('panss_total', 'std'),
+
+        mss_total_mean=('mss_total', 'mean'),
+        mss_total_sd=('mss_total', 'std'),
+
+        first_lang_german=('first_language_category', lambda x: (x == 'German').sum()),
+        first_lang_both=('first_language_category', lambda x: (x == 'Both').sum()),
+        first_lang_other=('first_language_category', lambda x: (x == 'Other').sum()),
+    ).reset_index()
+
+    summary_controls = df_controls.groupby('group').agg(
+        n=('age', 'size'),
+        age_mean=('age', 'mean'),
+        age_sd=('age', 'std'),
+
+        gender_male=('gender', lambda x: (x == 'male').sum()),
+        gender_female=('gender', lambda x: (x == 'female').sum()),
+
+        education_mean=('education', 'mean'),
+        education_sd=('education', 'std'),
+
+        panss_total_mean=('panss_total', 'mean'),
+        panss_total_sd=('panss_total', 'std'),
+
+        mss_total_mean=('mss_total', 'mean'),
+        mss_total_sd=('mss_total', 'std'),
+
+        first_lang_german=('first_language_category', lambda x: (x == 'German').sum()),
+        first_lang_both=('first_language_category', lambda x: (x == 'Both').sum()),
+        first_lang_other=('first_language_category', lambda x: (x == 'Other').sum()),
+    ).reset_index()
+
+    # Calculate percentages for gender
+    summary_patients['gender_male_pct'] = summary_patients['gender_male'] / summary_patients['n'] * 100
+    summary_patients['gender_female_pct'] = summary_patients['gender_female'] / summary_patients['n'] * 100
+
+    summary_controls['gender_male_pct'] = summary_controls['gender_male'] / summary_controls['n'] * 100
+    summary_controls['gender_female_pct'] = summary_controls['gender_female'] / summary_controls['n'] * 100
+
+    # Calculate percentages for first language
+    summary_patients['first_lang_german_pct'] = summary_patients['first_lang_german'] / summary_patients['n'] * 100
+    summary_patients['first_lang_both_pct'] = summary_patients['first_lang_both'] / summary_patients['n'] * 100
+    summary_patients['first_lang_other_pct'] = summary_patients['first_lang_other'] / summary_patients['n'] * 100
+
+    summary_controls['first_lang_german_pct'] = summary_controls['first_lang_german'] / summary_controls['n'] * 100
+    summary_controls['first_lang_both_pct'] = summary_controls['first_lang_both'] / summary_controls['n'] * 100
+    summary_controls['first_lang_other_pct'] = summary_controls['first_lang_other'] / summary_controls['n'] * 100
+
+    # Aggregate diagnosis counts
+    diagnosis_counts = df_patients.groupby(['group', 'diagnosis']).size().unstack(fill_value=0)
+    diagnosis_percent = diagnosis_counts.apply(lambda x: np.round(x * 100 / x.sum(), 2), axis=1)  # Normalize counts
+    diagnosis_counts.columns = [f'{col}' for col in diagnosis_counts.columns]
+    diagnosis_counts.reset_index(inplace=True)
+    diagnosis_percent.columns = [f'{col}_p' for col in diagnosis_percent.columns]
+    diagnosis_percent.reset_index(inplace=True)
+
+    # Merge diagnosis counts with summary
+    summary_patients = summary_patients.merge(diagnosis_counts, on='group', how='left').rename(columns=diag_dict).merge(diagnosis_percent, on='group', how='left').rename(columns=diag_dict)
+
+    # Create combined and breakout tables
+    combined_table = format_combined_table(summary_patients, summary_controls)
+    breakout_table = format_breakout_table(summary_patients)
+
+    combined_table['group'] = combined_table['group'].replace(group_dict)
+    breakout_table['group'] = breakout_table['group'].replace(group_dict)
+
+    combined_table.rename({"group": "Group"}, axis=1, inplace=True)
+    breakout_table.rename({"group": "Group"}, axis=1, inplace=True)
+
+    return combined_table, breakout_table
+
+def main():
+    """Main execution function."""
+    multivariate = True
+    lower = CONFIG["shared"]["preprocessing"]["lower"]
+    case = "lower" if lower else "upper"
+
+    df_raw = pd.read_csv(
+        CONFIG["aggregation"]["paths"]["output"],
+        index_col=0,
+        dtype=str,
+    )
+
+    df_raw["z_Real_semantic_include0_includeN_8"] = -df_raw["z_Real_semantic_include0_includeN_8"].astype(float)
+
+    df_filtered = df_raw[df_raw["number_tokens"].astype(float) >= CONFIG["min_tokens"]]
+    df_filtered = df_filtered[df_filtered["task"] == CONFIG["task_type"]]
+    exclusions_bev = CONFIG["exclusions_bev"]
+    demographics = CONFIG["stats"]["demographics"]
+
+    metrics = CONFIG["metrics"]
+    new_metrics = CONFIG["new_metrics"]
+
+    bev_cols_quant = CONFIG["stats"]["outcomes"]["clinical"] + CONFIG["stats"]["outcomes"]["cognitive"]
+
+    # Convert numeric columns to float
+    df_filtered[metrics] = df_filtered[metrics].astype(float)
+    df_filtered[bev_cols_quant] = df_filtered[bev_cols_quant].astype(float)
+    numeric_demographics = ['age', 'education']
+    df_filtered[numeric_demographics] = df_filtered[numeric_demographics].astype(float)
+
+    # Get unique columns for aggregation, ensuring we include demographics and group
+    all_cols = list(set(metrics + bev_cols_quant + exclusions_bev + demographics + ['group']))
+
+    # Create aggregation dictionary ensuring no duplicates
+    agg_dict = {}
+    for col in all_cols:
+        if col in metrics:
+            agg_dict[col] = 'mean'
+        else:
+            agg_dict[col] = 'first'
+
+    # Perform groupby aggregation
+    task_means = df_filtered.drop("sub_task", axis=1).groupby("study_id").agg(agg_dict).reset_index()
+
+    normality_results_means = apply_normality_test(task_means)
+    normality_results = apply_normality_test(df_filtered)
+
+    task_means["first_language"] = task_means['first_language'].apply(categorize_first_language)
+
+    metric_residuals, metrics_demographics_results = regress_out_demographics(
+        task_means,
+        outcomes=metrics,
+        demographics=['age', 'gender', 'education', 'first_language'],
+        names=names)
+
+    # Process metrics for cognition using residuals (no group filter)
+    ls_multivar = process_regression_on_residuals(
+        metric_residuals[metric_residuals["group"] != "patient"],
+        metrics,  # No need for _resid suffix, these columns are overwritten
+        predictors=['working_memory', 'stroop_inhibition'],
+        names=names,
+        group_filter=None
+    )
+
+    # Process metrics for patients using residuals (filter for patients)
+    pat_multivar = process_regression_on_residuals(
+        metric_residuals,
+        metrics,
+        predictors=['working_memory', 'stroop_inhibition', 'panss_pos_sum', 'panss_neg_sum'
+                    ],
+        names=names,
+        group_filter="patient"
+    )
+
+    # Process metrics for healthy subjects using residuals (filter for healthy subjects)
+    hs_multivar = process_regression_on_residuals(
+        metric_residuals,
+        metrics,
+        predictors=['working_memory', 'stroop_inhibition', 'mss_pos_sum', 'mss_neg_sum', 'mss_dis_sum'],
+        names=names,
+        group_filter="hs"
+    )
+
+    corr_control_cog_demo_length_patients = hierarchical_regression_with_vif(task_means[task_means["group"] == "patient"], new_metrics, [
+        'panss_pos_sum', 'panss_neg_sum'
+    ], ["stroop_inhibition", "working_memory", "number_tokens"])
+
+    corr_control_cog_demo_patients = hierarchical_regression_with_vif(task_means[task_means["group"] == "patient"], metrics, [
+        'panss_neg_sum',
+    ], ["stroop_inhibition", "working_memory"])
+
+    path = CONFIG["stats"]["paths"]["figures_dir"]
+
+    anova_list = []
+    pairwise_list = []
+
+    alpha = 0.05  # Add these missing config values
+    num_tests = 4
+    corrected_alpha = alpha / num_tests
+    n_groups = len(df_filtered["group"].unique())
+    n_subjects = len(df_filtered["study_id"].unique())
+
+    anova, pairwise = compare_groups_single_plot_regression(task_means, metrics, "group", path, scale_data=False, n_comp=len(metrics))
+    anova_list += anova
+    pairwise_list += pairwise
+
+    pairwise_df = pd.DataFrame(pairwise_list)
+    anova_df = pd.DataFrame(anova_list)
+
+    anova_table = anova_df.copy().reset_index(drop=True)[['metric', 'r_squared', 'f_stat', 'p_model', 'Low Sczt mean', 'Low Sczt sd',
+                                                          'High Sczt mean', 'High Sczt sd', 'Psychosis mean', 'Psychosis sd']]
+
+    anova_table['Low Sczt'] = anova_table.apply(lambda row: f"{row['Low Sczt mean']:.2f} ({row['Low Sczt sd']:.2f})", axis=1)
+    anova_table['High Sczt'] = anova_table.apply(lambda row: f"{row['High Sczt mean']:.2f} ({row['High Sczt sd']:.2f})", axis=1)
+    anova_table['Psychosis'] = anova_table.apply(lambda row: f"{row['Psychosis mean']:.2f} ({row['Psychosis sd']:.2f})", axis=1)
+
+    anova_table['r_squared'] = anova_table.apply(lambda row: f"{row['r_squared']:.2f}", axis=1)
+    anova_table['f_stat'] = anova_table.apply(lambda row: f"{row['f_stat']:.2f} ({format_p_value(row['p_model'])})", axis=1)
+
+    # Drop the separate mean and SD columns
+    anova_table = anova_table[['metric', 'r_squared', 'f_stat', 'Low Sczt', 'High Sczt', 'Psychosis']]
+
+    # Rename columns to the final desired names
+    anova_table.columns = ["Metric", "${R}^2$", "F", "Low Sczt (Mean, SD)", "High Sczt (Mean, SD)", "Psychosis (Mean, SD)"]
+
+    pairwise_table = pairwise_df.copy().reset_index(drop=True)
+    pairwise_table.columns = ["Metric", "Comparison", "Mean Diff.", "p adj."]
+
+    df_summary = df_filtered[[
+        "study_id",
+        "group",
+        "age",
+        "gender",
+        "education",
+        "first_language",
+        "diagnosis",
+        'duration_untreated',
+        'age_onset',
+        'antipsy_duration',
+        "panss_total",
+        "mss_total"]].groupby("study_id").first().reset_index()
+
+    formatted_summary, patient_breakout = summary_table(df_summary)
+
+    results = {
+        "corr_df": correlation_apa(task_means, cog_var + metrics, names),
+        "formatted_summary": formatted_summary,
+        "breakout_patients": patient_breakout,
+        "normality_results": normality_results,
+        "normality_results_means": normality_results_means,
+        "demographic_factors_metrics": metrics_demographics_results,
+        "metrics_ls_reg": format_regression_results_apa(ls_multivar, 40),
+        "metrics_ls_reg_dict": ls_multivar,
+        "metrics_pat_reg": format_regression_results_apa(pat_multivar, 40),
+        "metrics_pat_reg_dict": pat_multivar,
+        "metrics_hs_reg": format_regression_results_apa(hs_multivar, 40),
+        "metrics_hs_reg_dict": hs_multivar,
+
+        "stepwise_length": corr_control_cog_demo_length_patients,
+        "stepwise": corr_control_cog_demo_patients,
+
+        "stepwise_length_str": report_stepwise_regression(corr_control_cog_demo_length_patients),
+        "stepwise_str": report_stepwise_regression(corr_control_cog_demo_patients),
+
+
+        "pairwise": pairwise_table,
+        "anova": anova_table,
+        "anova_str": report_categorical_regression(anova_df),
+        "alpha": alpha,
+        "num_tests": num_tests,
+        "corrected_alpha": corrected_alpha,
+        "n_groups": n_groups,
+        "n_subjects": n_subjects}
+
+    with open(os.path.join(CONFIG["stats"]["paths"]["results_dir"], f"stats_results{'_lower' if lower else '_upper'}.pkl"), 'wb') as handle:
+        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+    df_filtered.to_csv(os.path.join(CONFIG["stats"]["paths"]["results_dir"], f"filtered_df{'_lower' if lower else '_upper'}.csv"), index=False)
+    df_raw[df_raw["task"] == CONFIG["task_type"]].to_csv(os.path.join(CONFIG["stats"]["paths"]["results_dir"], f"raw_df{'_lower' if lower else '_upper'}.csv"), index=False)
+    task_means.to_csv(os.path.join(CONFIG["stats"]["paths"]["results_dir"], f"task_means{'_lower' if lower else '_upper'}.csv"), index=False)
+
+    return True
+
+if __name__ == "__main__":
+    main()
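
For orientation, the core pipeline of the new stats_fluency.py shown above first residualizes each outcome on demographics (regress_out_demographics), then regresses those residuals on cognitive and clinical predictors (multivariable_regression_with_residuals). Below is a minimal, self-contained sketch of that two-stage idea, not code from the package: the data are synthetic and the column names (age, education, fluency_score, working_memory) are hypothetical stand-ins for the package's actual columns.

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 100
# Synthetic frame; columns are illustrative stand-ins, not the package's data.
df = pd.DataFrame({
    "age": rng.normal(40, 10, n),
    "education": rng.normal(14, 3, n),
    "fluency_score": rng.normal(0, 1, n),
    "working_memory": rng.normal(0, 1, n),
})

# Stage 1: regress the outcome on demographics, keep the residuals.
X_demo = sm.add_constant(df[["age", "education"]])
residuals = sm.OLS(df["fluency_score"], X_demo).fit().resid

# Stage 2: regress the residuals on the predictor of interest.
X_pred = sm.add_constant(df[["working_memory"]])
stage2 = sm.OLS(residuals, X_pred).fit()
print(f"beta = {stage2.params['working_memory']:.2f}, "
      f"p = {stage2.pvalues['working_memory']:.3f}")

The hierarchical step in hierarchical_regression_with_vif likewise reduces to a nested-model F-test: fit a control-only model and a control-plus-metric model, then test whether the added term improves fit. A sketch reusing the synthetic frame above (again an illustration under assumed data, not the package's API):

# Does adding the metric improve fit over the control model?
m1 = sm.OLS(df["fluency_score"], sm.add_constant(df[["age"]])).fit()
m2 = sm.OLS(df["fluency_score"], sm.add_constant(df[["age", "working_memory"]])).fit()
anova = sm.stats.anova_lm(m1, m2)  # F-test on the R² change between nested models
print(anova)
print(f"change in adjusted R² = {m2.rsquared_adj - m1.rsquared_adj:.3f}")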