drugsideeffect 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ include readme.md
2
+ recursive-include drugsideeffect/models *.pkl
@@ -0,0 +1,152 @@
1
+ Metadata-Version: 2.4
2
+ Name: drugsideeffect
3
+ Version: 0.1.4
4
+ Summary: Visualize side effects from textual data
5
+ Home-page: https://github.com/debbdeb/drugsideeffect
6
+ Author: Briti Deb
7
+ Author-email: britideb@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: pandas>=1.5
13
+ Requires-Dist: numpy>=1.23
14
+ Requires-Dist: matplotlib>=3.6
15
+ Requires-Dist: seaborn>=0.12
16
+ Requires-Dist: plotly>=5.11
17
+ Requires-Dist: nltk>=3.8
18
+ Requires-Dist: textblob>=0.10
19
+ Requires-Dist: spacy>=3.5
20
+ Dynamic: author
21
+ Dynamic: author-email
22
+ Dynamic: classifier
23
+ Dynamic: description
24
+ Dynamic: description-content-type
25
+ Dynamic: home-page
26
+ Dynamic: requires-dist
27
+ Dynamic: requires-python
28
+ Dynamic: summary
29
+
30
+ # Python package sideeffect
31
+
32
+ **sideeffect** is a Python library for processing textual health-related data to extract, analyze, and visualize potential side effects of pharmaceutical drugs from text sources such as social media posts, surveys, or reports.
33
+
34
+ It provides a pipeline for:
35
+ - Cleaning and preprocessing text
36
+ - Extracting known and uncommon symptoms
37
+ - Detecting alert-related keywords
38
+ - Generating multiple analytical visualizations
39
+
40
+ ---
41
+
42
+ # Disclaimer
43
+
44
+ **This library is for informational, educational, and research purposes only.**
45
+ It is **not intended to provide medical advice, diagnosis, or treatment**.
46
+ Do **not** use it as a substitute for professional healthcare guidance.
47
+ Always consult a qualified healthcare professional regarding any medical concerns.
48
+
49
+ Note: The code is provided for demonstration purposes; it is not fully optimized and generates only basic visualizations.
50
+
51
+ ---
52
+
53
+ # Installation from PyPI:
54
+
55
+ %pip install drugsideeffect
56
+
57
+
58
+ # Quick Start Example
59
+
60
+ import warnings
61
+ warnings.filterwarnings("ignore") # Hide warnings
62
+
63
+ import os
64
+ import pandas as pd
65
+
66
+ # Set working directory
67
+ os.chdir(r"C:\path\to\project")
68
+
69
+ # Add the project folder to path
70
+ import sys
71
+ sys.path.append(r"C:\path\to\project")
72
+
73
+ # Import the custom functions
74
+ from drugsideeffect.processing import main
75
+ from drugsideeffect.visualization import (
76
+ plot_day_of_week_distribution,
77
+ plot_data_count_per_month,
78
+ plot_known_symptoms,
79
+ plot_uncommon_side_effects_pie_chart,
80
+ plot_proportion_of_english_words,
81
+ plot_create_side_effects_correlation,
82
+ plot_create_side_effects_visualizations,
83
+ plot_sentiment_distribution,
84
+ plot_visualize,
85
+ plot_symptom_extraction,
86
+ )
87
+
88
+
89
+ # -----------------------------
90
+ # Step 1: Load a CSV file with at least four columns, for example:
91
+ ## Headers: Date, month, RT_Like, text
92
+ ## Row values: 3/15/2023 8:23, 4, 1, Day 2 and I feel better
93
+
94
+
95
+ # -----------------------------
96
+ df = pd.read_csv("data.csv")
97
+
98
+
99
+ # -----------------------------
100
+ # Step 2: Extract symptoms & alert keywords
101
+ # -----------------------------
102
+ # This must be done first so that derived columns like 'extracted_symptoms' exist
103
+ plot_symptom_extraction(df)
104
+
105
+ # -----------------------------
106
+ # Step 3: Basic distributions & sentiment analysis
107
+ # -----------------------------
108
+ plot_day_of_week_distribution(df)
109
+ plot_data_count_per_month(df)
110
+ plot_sentiment_distribution(df)
111
+
112
+ # -----------------------------
113
+ # Step 4: Side effects correlation & detailed visualizations
114
+ # -----------------------------
115
+ plot_create_side_effects_correlation(df)
116
+ plot_create_side_effects_visualizations(df)
117
+ plot_visualize(df)
118
+
119
+ # -----------------------------
120
+ # Step 5: Plots that depend on extracted columns
121
+ # -----------------------------
122
+ plot_known_symptoms(df)
123
+ plot_uncommon_side_effects_pie_chart(df)
124
+ plot_proportion_of_english_words(df)
125
+
126
+ print("All plots generated successfully!")
127
+
128
+
129
+
130
+
131
+ # License
132
+ MIT License
133
+
134
+ Copyright (c) 2026 Briti Deb
135
+
136
+ Permission is hereby granted, free of charge, to any person obtaining a copy
137
+ of this software and associated documentation files (the "Software"), to deal
138
+ in the Software without restriction, including without limitation the rights
139
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
140
+ copies of the Software, and to permit persons to whom the Software is
141
+ furnished to do so, subject to the following conditions:
142
+
143
+ The above copyright notice and this permission notice shall be included in all
144
+ copies or substantial portions of the Software.
145
+
146
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
147
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
148
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
149
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
150
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
151
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
152
+ SOFTWARE.
@@ -0,0 +1,86 @@
1
+ # __init__.py
2
+
3
+ # Import main processing function
4
+ from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
5
+
6
+ # Import all visualization functions
7
+ from .visualization import (
8
+ plot_day_of_week_distribution,
9
+ plot_data_count_per_month,
10
+ plot_sentiment_distribution,
11
+ plot_known_symptoms,
12
+ plot_uncommon_side_effects_pie_chart,
13
+ plot_proportion_of_english_words,
14
+ plot_create_side_effects_correlation,
15
+ plot_create_side_effects_visualizations,
16
+ plot_visualize,
17
+ plot_symptom_extraction,
18
+ plot_onset_times
19
+ )
20
+
21
+ # Package version
22
# Matches the distribution version declared in the package metadata.
__version__ = "0.1.4"

# Names exported by a star-import of this package: the processing helpers,
# the plotting helpers, and the end-to-end ``sideeffect`` pipeline defined
# further down in this module.
__all__ = [
    "main",
    "process_onset_data",
    "classify_sideeffects",
    "clean_text",
    "extract_symptoms",
    "normalize_slang",
    "extract_known_symptoms",
    "extract_uncommon_symptoms",
    "load_and_process_data",
    "plot_day_of_week_distribution",
    "plot_data_count_per_month",
    "plot_sentiment_distribution",
    "plot_known_symptoms",
    "plot_uncommon_side_effects_pie_chart",
    "plot_proportion_of_english_words",
    "plot_create_side_effects_correlation",
    "plot_create_side_effects_visualizations",
    "plot_visualize",
    "plot_symptom_extraction",
    "plot_onset_times",
    "sideeffect",
]
47
+
48
+ # -----------------------------
49
+ # Pipeline function
50
+ # -----------------------------
51
def sideeffect(input_file_path):
    """
    Run the complete analysis on one CSV file and draw every plot.

    The file is loaded and processed by ``main`` and the resulting
    DataFrame is handed to each plotting function in a fixed order.
    When a 'Date' column is present, the onset columns are added with
    ``process_onset_data`` and the onset plot is drawn as well.

    Returns the final DataFrame.
    """
    df = main(input_file_path)

    # Plots are drawn in this order: basic distributions, symptom plots,
    # then correlation and detailed visualizations.
    plotters = (
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plotters:
        draw(df)

    # NOTE(review): both onset steps are assumed to be guarded by the
    # 'Date' check -- confirm against the original layout.
    if 'Date' not in df.columns:
        return df

    df = process_onset_data(df)
    plot_onset_times(df)
    return df
@@ -0,0 +1,86 @@
1
+ # __init__.py
2
+
3
+ # Import main processing function
4
+ from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
5
+
6
+ # Import all visualization functions
7
+ from .visualization import (
8
+ plot_day_of_week_distribution,
9
+ plot_data_count_per_month,
10
+ plot_sentiment_distribution,
11
+ plot_known_symptoms,
12
+ plot_uncommon_side_effects_pie_chart,
13
+ plot_proportion_of_english_words,
14
+ plot_create_side_effects_correlation,
15
+ plot_create_side_effects_visualizations,
16
+ plot_visualize,
17
+ plot_symptom_extraction,
18
+ plot_onset_times
19
+ )
20
+
21
+ # Package version
22
# Matches the distribution version declared in the package metadata.
__version__ = "0.1.4"

# Names exported by a star-import of this package: the processing helpers,
# the plotting helpers, and the end-to-end ``sideeffect`` pipeline defined
# further down in this module.
__all__ = [
    "main",
    "process_onset_data",
    "classify_sideeffects",
    "clean_text",
    "extract_symptoms",
    "normalize_slang",
    "extract_known_symptoms",
    "extract_uncommon_symptoms",
    "load_and_process_data",
    "plot_day_of_week_distribution",
    "plot_data_count_per_month",
    "plot_sentiment_distribution",
    "plot_known_symptoms",
    "plot_uncommon_side_effects_pie_chart",
    "plot_proportion_of_english_words",
    "plot_create_side_effects_correlation",
    "plot_create_side_effects_visualizations",
    "plot_visualize",
    "plot_symptom_extraction",
    "plot_onset_times",
    "sideeffect",
]
47
+
48
+ # -----------------------------
49
+ # Pipeline function
50
+ # -----------------------------
51
def sideeffect(input_file_path):
    """
    Run the complete analysis on one CSV file and draw every plot.

    The file is loaded and processed by ``main`` and the resulting
    DataFrame is handed to each plotting function in a fixed order.
    When a 'Date' column is present, the onset columns are added with
    ``process_onset_data`` and the onset plot is drawn as well.

    Returns the final DataFrame.
    """
    df = main(input_file_path)

    # Plots are drawn in this order: basic distributions, symptom plots,
    # then correlation and detailed visualizations.
    plotters = (
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plotters:
        draw(df)

    # NOTE(review): both onset steps are assumed to be guarded by the
    # 'Date' check -- confirm against the original layout.
    if 'Date' not in df.columns:
        return df

    df = process_onset_data(df)
    plot_onset_times(df)
    return df
@@ -0,0 +1,283 @@
1
+ # Processing.py
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import re
6
+ import os
7
+ from textblob import TextBlob
8
+ from datetime import datetime
9
+ import joblib
10
+
11
+ # -----------------------------
12
+ # Load pretrained artifacts
13
+ # -----------------------------
14
# Directory containing this module; the pickled artifacts are looked up in
# its "models" sub-directory.
BASE_DIR = os.path.dirname(__file__)
_MODELS_DIR = os.path.join(BASE_DIR, "models")
MODEL_PATH = os.path.join(_MODELS_DIR, "sideeffect_nb.pkl")
VEC_PATH = os.path.join(_MODELS_DIR, "tfidf_vectorizer.pkl")

# Both artifacts are deserialized once, when the module is imported
# (classifier first, then vectorizer).
# NOTE(review): joblib is imported by this module but does not appear in the
# package's Requires-Dist list -- confirm it is declared as a dependency.
_model, _vectorizer = (joblib.load(path) for path in (MODEL_PATH, VEC_PATH))
20
+
21
+ # -----------------------------
22
+ # Side effect classification
23
+ # -----------------------------
24
def classify_sideeffects(df):
    """
    Predict a side-effect label for every row and keep only the positives.

    The 'text' column is cast to ``str``, vectorized with the module-level
    vectorizer and scored with the module-level model. The predictions are
    written to a 'sideeffect_pred' column on ``df`` itself (the caller's
    frame gains that column). A summary of the row counts before and after
    filtering is printed.

    Returns a new DataFrame, re-indexed from 0, containing only the rows
    whose prediction equals 1.

    Raises ValueError when ``df`` has no 'text' column.
    """
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    rows_before = len(df)

    features = _vectorizer.transform(df["text"].astype(str))

    # Label convention: 1 = sideeffect, 0 = no_sideeffect
    df["sideeffect_pred"] = _model.predict(features)

    is_sideeffect = df["sideeffect_pred"] == 1
    filtered_df = df[is_sideeffect].reset_index(drop=True)
    rows_after = len(filtered_df)

    print(f"Initial CSV had {rows_before} rows; after filtering side effects, {rows_after} rows remain.")

    return filtered_df
46
+
47
+ # -----------------------------
48
+ # Text cleaning
49
+ # -----------------------------
50
def clean_text(text):
    """
    Normalize a raw text snippet.

    In order: URLs are removed, then @mentions, then every character that
    is neither an ASCII letter nor whitespace; whitespace runs are collapsed
    to single spaces, the ends are trimmed and the result is lower-cased.

    Returns an empty string for any non-string input.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"@\w+", ""),
        (r"[^A-Za-z\s]", ""),
        (r"\s+", " "),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()
64
+
65
+ # -----------------------------
66
+ # Symptom extraction
67
+ # -----------------------------
68
# Default lexicon used by extract_symptoms when the caller supplies none.
# Built once at import time instead of on every call.
_DEFAULT_SYMPTOM_LEXICON = (
    "spike protein", "diabetes", "vascular", "autoimmune", "p53",
    "t cell", "vitamin d", "contaminated", "zinc", "circumin",
    "ivermectin", "cancer", "myocarditis", "hospital", "outpatient",
    "inpatient", "infection", "bacteria", "fungal", "viral", "sepsis",
    "respiratory", "gynaecology", "dermatology", "ophthalmology",
    "otology", "dental", "hypoxia", "doxycycline", "nanosolver",
    "anticoagulation", "aspirin", "stroke", "heart attack",
    "coronary artery disease", "arrhythmia", "autism",
    "neurodegenerative", "alzheimer", "cognitive",
    "vascular dementia", "parkinson", "immune", "pots", "mcas",
    "insomnia", "new onset dyslipidaemia", "hypertension",
    "cardio-metabolic disturbance", "nervous system",
    "mast cell activity in skin", "post-covid-19 vaccine syndrome",
    "seizure disorders", "migraines", "neuropathy",
    "inflammatory bowel disease", "depression", "anxiety disorders",
    "chronic fatigue syndrome", "lyme disease", "fibromyalgia",
    "arthritis", "chronic obstructive pulmonary disease", "copd",
    "asthma", "chronic kidney disease", "ckd", "chronic heart failure",
    "chf", "bleeding disorders", "atherosclerosis", "vasculopathies",
    "endothelialitis", "thrombosis", "coagulopathy", "long covid",
    "thrombocytopenia", "low platelet", "internal bleeding",
    "lymphopenia", "neutropenia", "suppressed immune", "immune dysfunction",
    "muscle pain", "joint pain", "vomiting", "fever", "autoimmunity",
    "sleep apnea", "guillian barre syndrome", "adem", "cvst",
    "spike amyloids hamper fibrinolysis", "sticky blood", "neuropsychiatric",
    "mrna", "psychosis", "dementia", "schizophrenia", "suicidal",
    "homicidal", "brain clot", "violent behavior", "cognitive decline",
    "delusion", "takotsubo cardiomyopathy", "lipid nanoparticle toxicity",
    "allergenic", "cytotoxic", "pneumonia", "endocrine",
    "immune microclot", "vascular dysfunction", "teamclot",
    "organ impairment", "endothelian diagnostic", "thromboembolic events",
    "inflammatory cytokine increase", "allergic reactions", "igg increase",
    "iga increase",
)


def extract_symptoms(text, lexicon=None):
    """
    Return the lexicon entries that occur in ``text``.

    Parameters:
        text: the text to scan; any non-string value yields an empty list.
        lexicon: iterable of terms to look for. When omitted, the default
            lexicon defined above is used.

    Returns:
        A list of the matching entries, in lexicon order.

    Matching is a plain substring test against the lower-cased text, so
    entries are expected to be lower-case, and a short entry also matches
    when it appears inside a longer word.
    """
    if not isinstance(text, str):
        return []
    if lexicon is None:
        lexicon = _DEFAULT_SYMPTOM_LEXICON

    # Lower-case once instead of once per lexicon entry.
    lowered = text.lower()
    return [term for term in lexicon if term in lowered]
113
+
114
+ # -----------------------------
115
+ # Slang normalization
116
+ # -----------------------------
117
# Maps an informal phrase (lower-case) to the standard symptom names it
# stands for.
slang_lexicon = {
    "feel like shit": ["fatigue", "malaise"],
    "exhausted": ["fatigue"],
    "shield against the storm": ["immune response", "general malaise"],
    "i have been run over by a truck": ["muscle pain", "joint pain", "fatigue"],
    "tired": ["fatigue"],
    "knackered": ["fatigue"],
    "wiped out": ["fatigue"],
    "brain fog": ["cognitive"],
    "sleepy all day": ["insomnia", "fatigue"],
    "my head is pounding": ["headache"],
    "can't sleep": ["insomnia"],
    "heart racing": ["arrhythmia"],
    "out of breath": ["respiratory distress", "fatigue"],
}


def normalize_slang(text, slang_lexicon=slang_lexicon):
    """
    Translate slang phrases found in ``text`` into standard symptom names.

    Parameters:
        text: the text to scan; any non-string value yields an empty list.
        slang_lexicon: mapping of slang phrase -> list of symptom names.
            Defaults to the module-level ``slang_lexicon``.

    Returns:
        A list of unique symptom names, in the order they are first
        produced while walking the lexicon. The order is therefore the
        same on every run.

    Phrases are matched as substrings of the lower-cased text.
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    matched_symptoms = []

    for slang, symptoms in slang_lexicon.items():
        if slang in text_lower:
            matched_symptoms.extend(symptoms)

    # dict.fromkeys removes duplicates while keeping first-seen order; a
    # set would make the order vary between interpreter runs.
    return list(dict.fromkeys(matched_symptoms))
149
+
150
+ # -----------------------------
151
+ # Extract known/uncommon symptoms
152
+ # -----------------------------
153
def extract_known_symptoms(text):
    """
    Return the known-symptom keywords that occur in ``text``.

    Keywords are matched as substrings of the lower-cased text and are
    returned in the order of the keyword list. Non-string input yields
    an empty list.
    """
    if not isinstance(text, str):
        return []

    known_symptoms_keywords = (
        "fever", "fatigue", "headache", "muscle pain", "joint pain",
        "vomiting", "insomnia", "cognitive", "anxiety disorders",
        "depression", "respiratory", "asthma", "chronic fatigue syndrome",
        "migraine", "neuropathy", "sleep apnea",
    )

    haystack = text.lower()
    found = []
    for keyword in known_symptoms_keywords:
        if keyword in haystack:
            found.append(keyword)
    return found
163
+
164
def extract_uncommon_symptoms(text):
    """
    Return the uncommon-symptom keywords that occur in ``text``.

    Keywords are matched as substrings of the lower-cased text and are
    returned in the order of the keyword list. Non-string input yields
    an empty list.
    """
    if not isinstance(text, str):
        return []

    uncommon_symptoms_keywords = (
        "myocarditis", "stroke", "heart attack", "coronary artery disease",
        "arrhythmia", "thrombosis", "coagulopathy", "thrombocytopenia",
        "low platelet", "internal bleeding", "lymphopenia", "neutropenia",
        "guillian barre syndrome", "adem", "cvst", "takotsubo cardiomyopathy",
        "lipid nanoparticle toxicity", "brain clot", "psychosis", "schizophrenia",
        "suicidal", "homicidal", "autoimmunity", "vascular dysfunction",
        "immune dysfunction", "organ impairment", "spike amyloids hamper fibrinolysis",
        "sticky blood", "neuropsychiatric", "post-covid-19 vaccine syndrome", "long covid",
    )

    haystack = text.lower()
    found = []
    for keyword in uncommon_symptoms_keywords:
        if keyword in haystack:
            found.append(keyword)
    return found
178
+
179
+ # -----------------------------
180
+ # Load and process CSV
181
+ # -----------------------------
182
def load_and_process_data(input_file_path, text_column="text"):
    """
    Read a CSV file and add the text-derived columns, without classifying.

    Parameters:
        input_file_path: path passed to ``pd.read_csv``.
        text_column: name of the column holding the raw text.

    Returns:
        The DataFrame with these columns added: 'cleaned_text',
        'extracted_symptoms', 'extracted_uncommon_symptoms',
        'known_symptoms_flag', 'uncommon_symptoms_flag' and
        'alert_keywords_flag' (the flags are 0/1 integers).

    Raises:
        ValueError: if ``text_column`` is not a column of the file.
    """
    df = pd.read_csv(input_file_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in input file.")

    alert_keywords = ["urgent", "emergency", "severe", "critical", "immediate"]

    def _non_empty(items):
        # 1 when at least one symptom was extracted, else 0.
        return int(len(items) > 0)

    def _mentions_alert(cleaned):
        # 1 when any alert keyword is a substring of the cleaned text.
        return int(any(k in cleaned for k in alert_keywords))

    df["cleaned_text"] = df[text_column].astype(str).apply(clean_text)
    cleaned_col = df["cleaned_text"]

    df["extracted_symptoms"] = cleaned_col.apply(extract_known_symptoms)
    df["extracted_uncommon_symptoms"] = cleaned_col.apply(extract_uncommon_symptoms)
    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(_non_empty)
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(_non_empty)
    df["alert_keywords_flag"] = cleaned_col.apply(_mentions_alert)

    return df
198
+
199
+ # -----------------------------
200
+ # Main pipeline
201
+ # -----------------------------
202
def main(csv_path, text_column="text"):
    """
    Load a CSV file, keep the rows classified as side effects, and add the
    text-derived columns.

    Parameters:
        csv_path: path passed to ``pd.read_csv``.
        text_column: name of the column holding the raw text.

    Returns:
        The filtered DataFrame with 'cleaned_text', 'extracted_symptoms',
        'extracted_uncommon_symptoms', 'known_symptoms_flag',
        'uncommon_symptoms_flag' and 'alert_keywords_flag' added.

    Raises:
        ValueError: if ``text_column`` is not a column of the file.
    """
    df = pd.read_csv(csv_path)

    if text_column not in df.columns:
        raise ValueError(f"CSV must contain a '{text_column}' column")

    # Step 1: Filter side effects.
    # Note: classify_sideeffects reads the column literally named 'text',
    # independent of text_column.
    df = classify_sideeffects(df)

    # Step 2: Clean text
    df["cleaned_text"] = df[text_column].astype(str).apply(clean_text)

    # Step 3: Extract symptoms. The general column uses the full default
    # lexicon; the uncommon column uses the uncommon-symptom keywords, so
    # the two columns (and their flags) are no longer identical copies.
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_symptoms)
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)

    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda x: int(len(x) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda x: int(len(x) > 0))

    # Step 4: Detect alert keywords (substring match on the cleaned text)
    alert_keywords = ["urgent", "emergency", "severe", "critical", "immediate"]
    df["alert_keywords_flag"] = df["cleaned_text"].apply(lambda x: int(any(k in x for k in alert_keywords)))

    return df
226
+
227
+ # -----------------------------
228
+ # Onset time processing
229
+ # -----------------------------
230
# Numeric value of each quantity word the onset pattern can capture.
_ONSET_QUANTITY_WORDS = {
    "a": 1, "few": 2, "couple": 2, "several": 3,
    "many": 5, "dozen": 12, "half": 0.5,
}

# <quantity> <hour(s)|day(s)> <after|since|post|following ...>
# The leading \b keeps "a" from matching the tail of another word, and the
# short suffix list also covers the longer forms ("after vaccination",
# "post-dose", "post jab", ...), so one pattern is enough and a phrase is
# never counted twice.
_ONSET_PATTERN = re.compile(
    r"\b(\d+|a|few|couple|several|many|dozen|half)\s*(hours?|days?)\s*(?:after|since|post|following)",
    re.IGNORECASE,
)


def extract_onset_time(text):
    """
    Extract symptom-onset delays mentioned in ``text``, expressed in hours.

    A mention is a quantity (digits or one of a few quantity words), a
    unit ("hour(s)" or "day(s)") and a following "after", "since", "post"
    or "following". Day quantities are converted to hours (x 24).

    Parameters:
        text: the text to scan; any non-string value yields an empty list.

    Returns:
        A list with one number per mention, in order of appearance.
    """
    if not isinstance(text, str):
        return []

    onset_times = []
    for match in _ONSET_PATTERN.finditer(text):
        quantity = match.group(1).lower()
        unit = match.group(2).lower()

        amount = _ONSET_QUANTITY_WORDS.get(quantity)
        if amount is None:
            # Not a quantity word, so the pattern guarantees it is digits.
            amount = int(quantity)

        if unit.startswith("day"):
            amount *= 24

        onset_times.append(amount)
    return onset_times
257
+
258
def calculate_duration(timestamp, onset_times):
    """
    Offset ``timestamp`` by each onset delay.

    Parameters:
        timestamp: the reference time. Any missing value (``None``,
            ``pd.NaT``, NaN) yields an empty list.
        onset_times: list of delays in hours. Anything that is not a list
            yields an empty list; entries that are not ``int``/``float``
            are skipped.

    Returns:
        A list of ``timestamp + pd.Timedelta(hours=onset)`` values, one per
        usable entry of ``onset_times``.
    """
    # pd.isna also catches None and NaN, which an identity check against
    # pd.NaT would let through to the addition below.
    if not isinstance(onset_times, list) or pd.isna(timestamp):
        return []

    return [
        timestamp + pd.Timedelta(hours=onset)
        for onset in onset_times
        if isinstance(onset, (int, float))
    ]
266
+
267
def process_onset_data(df):
    """
    Prepare a DataFrame for onset/duration plotting.

    Works on a copy of ``df``. 'Date' is parsed with ``pd.to_datetime``
    (unparseable values become missing) and rows without a valid date are
    dropped. Three columns are then added:

    - 'timestamp': the parsed 'Date'
    - 'onset_time': result of ``extract_onset_time`` on 'text'
    - 'symptom_duration': result of ``calculate_duration`` on the two
      columns above

    Raises:
        ValueError: if 'Date' or 'text' is missing from ``df``.
    """
    if 'Date' not in df.columns:
        raise ValueError("CSV must contain a 'Date' column")
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    # Copy after filtering so the assignments below write to an
    # independent frame.
    df = df.dropna(subset=['Date']).copy()

    df['timestamp'] = df['Date']
    df['onset_time'] = df['text'].apply(extract_onset_time)

    # Built with a plain comprehension so that a frame with no rows still
    # gets an (empty) column rather than going through a row-wise apply.
    df['symptom_duration'] = [
        calculate_duration(stamp, onsets)
        for stamp, onsets in zip(df['timestamp'], df['onset_time'])
    ]
    return df