drugsideeffect 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- drugsideeffect/__init__.py +86 -0
- drugsideeffect/drugsideeffect/__init__.py +86 -0
- drugsideeffect/drugsideeffect/models/__init__.py +0 -0
- drugsideeffect/drugsideeffect/processing.py +283 -0
- drugsideeffect/drugsideeffect/sideeffect_pipeline.py +89 -0
- drugsideeffect/drugsideeffect/visualization.py +1700 -0
- drugsideeffect/models/__init__.py +0 -0
- drugsideeffect/models/sideeffect_nb.pkl +0 -0
- drugsideeffect/models/tfidf_vectorizer.pkl +0 -0
- drugsideeffect/processing.py +283 -0
- drugsideeffect/setup.py +37 -0
- drugsideeffect/sideeffect_pipeline.py +89 -0
- drugsideeffect/visualization.py +1700 -0
- drugsideeffect-0.1.4.dist-info/METADATA +152 -0
- drugsideeffect-0.1.4.dist-info/RECORD +17 -0
- drugsideeffect-0.1.4.dist-info/WHEEL +5 -0
- drugsideeffect-0.1.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# __init__.py
|
|
2
|
+
|
|
3
|
+
# Import main processing function
|
|
4
|
+
from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
|
|
5
|
+
|
|
6
|
+
# Import all visualization functions
|
|
7
|
+
from .visualization import (
|
|
8
|
+
plot_day_of_week_distribution,
|
|
9
|
+
plot_data_count_per_month,
|
|
10
|
+
plot_sentiment_distribution,
|
|
11
|
+
plot_known_symptoms,
|
|
12
|
+
plot_uncommon_side_effects_pie_chart,
|
|
13
|
+
plot_proportion_of_english_words,
|
|
14
|
+
plot_create_side_effects_correlation,
|
|
15
|
+
plot_create_side_effects_visualizations,
|
|
16
|
+
plot_visualize,
|
|
17
|
+
plot_symptom_extraction,
|
|
18
|
+
plot_onset_times
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Package version -- kept in step with the published distribution
# (this wheel is released as drugsideeffect 0.1.4).
__version__ = "0.1.4"

# Names exported by ``from drugsideeffect import *``.
__all__ = [
    # text processing helpers (from .processing)
    "main",
    "process_onset_data",
    "classify_sideeffects",
    "clean_text",
    "extract_symptoms",
    "normalize_slang",
    "extract_known_symptoms",
    "extract_uncommon_symptoms",
    "load_and_process_data",
    # plotting helpers (from .visualization)
    "plot_day_of_week_distribution",
    "plot_data_count_per_month",
    "plot_sentiment_distribution",
    "plot_known_symptoms",
    "plot_uncommon_side_effects_pie_chart",
    "plot_proportion_of_english_words",
    "plot_create_side_effects_correlation",
    "plot_create_side_effects_visualizations",
    "plot_visualize",
    "plot_symptom_extraction",
    "plot_onset_times",
    # end-to-end pipeline defined in this module
    "sideeffect",
]
|
|
47
|
+
|
|
48
|
+
# -----------------------------
|
|
49
|
+
# Pipeline function
|
|
50
|
+
# -----------------------------
|
|
51
|
+
def sideeffect(input_file_path):
    """
    Execute the complete side-effect workflow for one CSV file.

    The file at ``input_file_path`` is handed to ``main`` and the resulting
    DataFrame is passed to every plotting helper in turn. When that frame
    has a 'Date' column it is additionally run through
    ``process_onset_data`` and ``plot_onset_times``.

    Returns the final DataFrame.
    """
    frame = main(input_file_path)

    # Plot helpers, in the order they are drawn.
    plotters = (
        # basic distributions
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        # symptom plots
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        # correlation & detailed visualizations
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plotters:
        draw(frame)

    # Onset plots are only produced when a 'Date' column is available.
    if 'Date' not in frame.columns:
        return frame

    frame = process_onset_data(frame)
    plot_onset_times(frame)
    return frame
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# __init__.py
|
|
2
|
+
|
|
3
|
+
# Import main processing function
|
|
4
|
+
from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
|
|
5
|
+
|
|
6
|
+
# Import all visualization functions
|
|
7
|
+
from .visualization import (
|
|
8
|
+
plot_day_of_week_distribution,
|
|
9
|
+
plot_data_count_per_month,
|
|
10
|
+
plot_sentiment_distribution,
|
|
11
|
+
plot_known_symptoms,
|
|
12
|
+
plot_uncommon_side_effects_pie_chart,
|
|
13
|
+
plot_proportion_of_english_words,
|
|
14
|
+
plot_create_side_effects_correlation,
|
|
15
|
+
plot_create_side_effects_visualizations,
|
|
16
|
+
plot_visualize,
|
|
17
|
+
plot_symptom_extraction,
|
|
18
|
+
plot_onset_times
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Package version -- kept in step with the published distribution
# (this wheel is released as drugsideeffect 0.1.4).
__version__ = "0.1.4"

# Names exported by ``from drugsideeffect import *``.
__all__ = [
    # text processing helpers (from .processing)
    "main",
    "process_onset_data",
    "classify_sideeffects",
    "clean_text",
    "extract_symptoms",
    "normalize_slang",
    "extract_known_symptoms",
    "extract_uncommon_symptoms",
    "load_and_process_data",
    # plotting helpers (from .visualization)
    "plot_day_of_week_distribution",
    "plot_data_count_per_month",
    "plot_sentiment_distribution",
    "plot_known_symptoms",
    "plot_uncommon_side_effects_pie_chart",
    "plot_proportion_of_english_words",
    "plot_create_side_effects_correlation",
    "plot_create_side_effects_visualizations",
    "plot_visualize",
    "plot_symptom_extraction",
    "plot_onset_times",
    # end-to-end pipeline defined in this module
    "sideeffect",
]
|
|
47
|
+
|
|
48
|
+
# -----------------------------
|
|
49
|
+
# Pipeline function
|
|
50
|
+
# -----------------------------
|
|
51
|
+
def sideeffect(input_file_path):
    """
    Execute the complete side-effect workflow for one CSV file.

    The file at ``input_file_path`` is handed to ``main`` and the resulting
    DataFrame is passed to every plotting helper in turn. When that frame
    has a 'Date' column it is additionally run through
    ``process_onset_data`` and ``plot_onset_times``.

    Returns the final DataFrame.
    """
    frame = main(input_file_path)

    # Plot helpers, in the order they are drawn.
    plotters = (
        # basic distributions
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        # symptom plots
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        # correlation & detailed visualizations
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plotters:
        draw(frame)

    # Onset plots are only produced when a 'Date' column is available.
    if 'Date' not in frame.columns:
        return frame

    frame = process_onset_data(frame)
    plot_onset_times(frame)
    return frame
|
|
File without changes
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# Processing.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import re
|
|
6
|
+
import os
|
|
7
|
+
from textblob import TextBlob
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
import joblib
|
|
10
|
+
|
|
11
|
+
# -----------------------------
|
|
12
|
+
# Load pretrained artifacts
|
|
13
|
+
# -----------------------------
|
|
14
|
+
# The pretrained artifacts live in the "models" folder next to this module.
BASE_DIR = os.path.dirname(__file__)
_MODELS_DIR = os.path.join(BASE_DIR, "models")
MODEL_PATH = os.path.join(_MODELS_DIR, "sideeffect_nb.pkl")
VEC_PATH = os.path.join(_MODELS_DIR, "tfidf_vectorizer.pkl")

# Both artifacts are loaded once, when the module is imported, and are
# shared by classify_sideeffects().
_model = joblib.load(MODEL_PATH)
_vectorizer = joblib.load(VEC_PATH)
|
|
20
|
+
|
|
21
|
+
# -----------------------------
|
|
22
|
+
# Side effect classification
|
|
23
|
+
# -----------------------------
|
|
24
|
+
def classify_sideeffects(df, text_column="text"):
    """
    Flag every row with the pretrained classifier and keep only side effects.

    Args:
        df: DataFrame holding the posts. It is modified in place: a
            ``sideeffect_pred`` column is added (1 = sideeffect,
            0 = no_sideeffect).
        text_column: Name of the column whose text is classified.
            Defaults to ``"text"``.

    Returns:
        A new DataFrame, with a fresh index, containing only the rows whose
        ``sideeffect_pred`` equals 1.

    Raises:
        ValueError: If ``text_column`` is not a column of ``df``.

    Prints the number of rows before and after filtering.
    """
    if text_column not in df.columns:
        raise ValueError(f"DataFrame must contain '{text_column}' column")

    initial_rows = len(df)

    if initial_rows:
        features = _vectorizer.transform(df[text_column].astype(str))
        predictions = _model.predict(features)
    else:
        # Nothing to classify. Skip the model call, since scikit-learn
        # estimators generally reject zero-sample input.
        predictions = []

    df["sideeffect_pred"] = predictions

    filtered_df = df[df["sideeffect_pred"] == 1].reset_index(drop=True)
    final_rows = len(filtered_df)

    print(f"Initial CSV had {initial_rows} rows; after filtering side effects, {final_rows} rows remain.")

    return filtered_df
|
|
46
|
+
|
|
47
|
+
# -----------------------------
|
|
48
|
+
# Text cleaning
|
|
49
|
+
# -----------------------------
|
|
50
|
+
def clean_text(text):
    """
    Normalise a raw post into lowercase, letters-only text.

    Links, @mentions and every character that is not an ASCII letter or
    whitespace are deleted, whitespace runs are collapsed to one space and
    the result is stripped and lowercased. Non-string input yields "".
    """
    if isinstance(text, str):
        # Applied in order; each entry is (pattern, replacement).
        substitutions = (
            (r"http\S+|www\S+|https\S+", ""),  # links
            (r"@\w+", ""),                      # @mentions
            (r"[^A-Za-z\s]", ""),               # digits, punctuation, symbols
            (r"\s+", " "),                      # collapse whitespace runs
        )
        cleaned = text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip().lower()

    return ""
|
|
64
|
+
|
|
65
|
+
# -----------------------------
|
|
66
|
+
# Symptom extraction
|
|
67
|
+
# -----------------------------
|
|
68
|
+
# Default lexicon used by extract_symptoms() when the caller supplies none.
# Built once at import time instead of on every call.
_DEFAULT_SYMPTOM_LEXICON = (
    "spike protein", "diabetes", "vascular", "autoimmune", "p53",
    "t cell", "vitamin d", "contaminated", "zinc", "circumin",
    "ivermectin", "cancer", "myocarditis", "hospital", "outpatient",
    "inpatient", "infection", "bacteria", "fungal", "viral", "sepsis",
    "respiratory", "gynaecology", "dermatology", "ophthalmology",
    "otology", "dental", "hypoxia", "doxycycline", "nanosolver",
    "anticoagulation", "aspirin", "stroke", "heart attack",
    "coronary artery disease", "arrhythmia", "autism",
    "neurodegenerative", "alzheimer", "cognitive",
    "vascular dementia", "parkinson", "immune", "pots", "mcas",
    "insomnia", "new onset dyslipidaemia", "hypertension",
    "cardio-metabolic disturbance", "nervous system",
    "mast cell activity in skin", "post-covid-19 vaccine syndrome",
    "seizure disorders", "migraines", "neuropathy",
    "inflammatory bowel disease", "depression", "anxiety disorders",
    "chronic fatigue syndrome", "lyme disease", "fibromyalgia",
    "arthritis", "chronic obstructive pulmonary disease", "copd",
    "asthma", "chronic kidney disease", "ckd", "chronic heart failure",
    "chf", "bleeding disorders", "atherosclerosis", "vasculopathies",
    "endothelialitis", "thrombosis", "coagulopathy", "long covid",
    "thrombocytopenia", "low platelet", "internal bleeding",
    "lymphopenia", "neutropenia", "suppressed immune", "immune dysfunction",
    "muscle pain", "joint pain", "vomiting", "fever", "autoimmunity",
    "sleep apnea", "guillian barre syndrome", "adem", "cvst",
    "spike amyloids hamper fibrinolysis", "sticky blood", "neuropsychiatric",
    "mrna", "psychosis", "dementia", "schizophrenia", "suicidal",
    "homicidal", "brain clot", "violent behavior", "cognitive decline",
    "delusion", "takotsubo cardiomyopathy", "lipid nanoparticle toxicity",
    "allergenic", "cytotoxic", "pneumonia", "endocrine",
    "immune microclot", "vascular dysfunction", "teamclot",
    "organ impairment", "endothelian diagnostic", "thromboembolic events",
    "inflammatory cytokine increase", "allergic reactions", "igg increase",
    "iga increase",
)


def extract_symptoms(text, lexicon=None):
    """
    Return the lexicon entries that occur in ``text``.

    Args:
        text: Text to scan. Matching is case-insensitive on the text side;
            lexicon entries are compared as given.
        lexicon: Iterable of terms to look for. When ``None`` the built-in
            default lexicon is used.

    Returns:
        A list of the matching entries, in lexicon order. Non-string
        ``text`` yields an empty list.

    NOTE: matching is plain substring containment, so a short entry also
    matches inside a longer word (e.g. "immune" matches "autoimmune").
    """
    if not isinstance(text, str):
        return []

    if lexicon is None:
        lexicon = _DEFAULT_SYMPTOM_LEXICON

    # Lowercase once rather than once per lexicon entry.
    lowered = text.lower()
    return [term for term in lexicon if term in lowered]
|
|
113
|
+
|
|
114
|
+
# -----------------------------
|
|
115
|
+
# Slang normalization
|
|
116
|
+
# -----------------------------
|
|
117
|
+
# Maps informal phrases to the standard symptom names they stand for.
slang_lexicon = {
    "feel like shit": ["fatigue", "malaise"],
    "exhausted": ["fatigue"],
    "shield against the storm": ["immune response", "general malaise"],
    "i have been run over by a truck": ["muscle pain", "joint pain", "fatigue"],
    "tired": ["fatigue"],
    "knackered": ["fatigue"],
    "wiped out": ["fatigue"],
    "brain fog": ["cognitive"],
    "sleepy all day": ["insomnia", "fatigue"],
    "my head is pounding": ["headache"],
    "can't sleep": ["insomnia"],
    "heart racing": ["arrhythmia"],
    "out of breath": ["respiratory distress", "fatigue"]
}


def normalize_slang(text, slang_lexicon=slang_lexicon):
    """
    Translate slang phrases found in ``text`` into standard symptom names.

    Args:
        text: Text to scan (matched case-insensitively, by substring).
        slang_lexicon: Mapping of lowercase slang phrase -> list of symptom
            names. Defaults to the module-level ``slang_lexicon``.

    Returns:
        A list of the distinct symptoms whose slang phrase occurs in
        ``text``, in first-seen order (lexicon order). Non-string ``text``
        yields an empty list.
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    matched_symptoms = []

    for slang, symptoms in slang_lexicon.items():
        if slang in text_lower:
            matched_symptoms.extend(symptoms)

    # dict.fromkeys de-duplicates while keeping a stable order. A set would
    # return the symptoms in an order that can change between runs.
    return list(dict.fromkeys(matched_symptoms))
|
|
149
|
+
|
|
150
|
+
# -----------------------------
|
|
151
|
+
# Extract known/uncommon symptoms
|
|
152
|
+
# -----------------------------
|
|
153
|
+
def extract_known_symptoms(text):
    """
    Return the known (common) symptom keywords that occur in ``text``.

    Matching is case-insensitive substring containment, so "migraine" also
    matches "migraines". Results follow the keyword list order. Non-string
    input yields an empty list.
    """
    known_symptoms_keywords = [
        "fever", "fatigue", "headache", "muscle pain", "joint pain",
        "vomiting", "insomnia", "cognitive", "anxiety disorders",
        "depression", "respiratory", "asthma", "chronic fatigue syndrome",
        "migraine", "neuropathy", "sleep apnea"
    ]
    if not isinstance(text, str):
        return []

    # Lowercase once rather than once per keyword.
    lowered = text.lower()
    return [s for s in known_symptoms_keywords if s in lowered]
|
|
163
|
+
|
|
164
|
+
def extract_uncommon_symptoms(text):
    """
    Return the uncommon / serious symptom keywords that occur in ``text``.

    Matching is case-insensitive substring containment and results follow
    the keyword list order. Non-string input yields an empty list.

    NOTE(review): "post-covid-19 vaccine syndrome" contains digits and
    hyphens, which clean_text() removes. That entry can only match text
    that has not been cleaned.
    """
    uncommon_symptoms_keywords = [
        "myocarditis", "stroke", "heart attack", "coronary artery disease",
        "arrhythmia", "thrombosis", "coagulopathy", "thrombocytopenia",
        "low platelet", "internal bleeding", "lymphopenia", "neutropenia",
        "guillian barre syndrome", "adem", "cvst", "takotsubo cardiomyopathy",
        "lipid nanoparticle toxicity", "brain clot", "psychosis", "schizophrenia",
        "suicidal", "homicidal", "autoimmunity", "vascular dysfunction",
        "immune dysfunction", "organ impairment", "spike amyloids hamper fibrinolysis",
        "sticky blood", "neuropsychiatric", "post-covid-19 vaccine syndrome", "long covid"
    ]
    if not isinstance(text, str):
        return []

    # Lowercase once rather than once per keyword.
    lowered = text.lower()
    return [s for s in uncommon_symptoms_keywords if s in lowered]
|
|
178
|
+
|
|
179
|
+
# -----------------------------
|
|
180
|
+
# Load and process CSV
|
|
181
|
+
# -----------------------------
|
|
182
|
+
def load_and_process_data(input_file_path, text_column="text"):
    """
    Load a CSV and derive the cleaned-text, symptom and alert columns.

    Unlike ``main`` this does not run the side-effect classifier, so no
    rows are filtered out.

    Args:
        input_file_path: Path (or anything ``pandas.read_csv`` accepts) of
            the CSV to load.
        text_column: Name of the column holding the raw text.

    Returns:
        The loaded DataFrame with these columns added: ``cleaned_text``,
        ``extracted_symptoms``, ``extracted_uncommon_symptoms``,
        ``known_symptoms_flag``, ``uncommon_symptoms_flag`` and
        ``alert_keywords_flag`` (flags are 0/1 integers).

    Raises:
        ValueError: If ``text_column`` is not present in the file.
    """
    df = pd.read_csv(input_file_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in input file.")

    # Missing cells become "" instead of the literal text "nan" that a
    # blanket astype(str) would produce.
    df["cleaned_text"] = df[text_column].apply(
        lambda value: clean_text("" if pd.isna(value) else str(value))
    )
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_known_symptoms)
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)
    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda x: int(len(x) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda x: int(len(x) > 0))

    # Flag rows whose cleaned text contains any alert keyword (substring match).
    alert_keywords = ["urgent", "emergency", "severe", "critical", "immediate"]
    df["alert_keywords_flag"] = df["cleaned_text"].apply(lambda x: int(any(k in x for k in alert_keywords)))

    return df
|
|
198
|
+
|
|
199
|
+
# -----------------------------
|
|
200
|
+
# Main pipeline
|
|
201
|
+
# -----------------------------
|
|
202
|
+
def main(csv_path, text_column="text"):
    """
    Load a CSV, keep the rows classified as side effects and annotate them.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        text_column: Name of the column holding the raw text.
            NOTE: classify_sideeffects() is called without a column
            argument and reads the 'text' column.

    Returns:
        The filtered DataFrame with these columns added: ``cleaned_text``,
        ``extracted_symptoms`` (full default lexicon),
        ``extracted_uncommon_symptoms`` (uncommon keywords only),
        ``known_symptoms_flag``, ``uncommon_symptoms_flag`` and
        ``alert_keywords_flag`` (flags are 0/1 integers).

    Raises:
        ValueError: If ``text_column`` is not present in the CSV.
    """
    df = pd.read_csv(csv_path)

    if text_column not in df.columns:
        raise ValueError(f"CSV must contain a '{text_column}' column")

    # Step 1: Filter side effects
    df = classify_sideeffects(df)

    # Step 2: Clean text. Missing cells become "" instead of the literal
    # text "nan" that a blanket astype(str) would produce.
    # NOTE: clean_text() drops digits and punctuation, so lexicon entries
    # containing them (e.g. "p53") cannot match cleaned_text.
    df["cleaned_text"] = df[text_column].apply(
        lambda value: clean_text("" if pd.isna(value) else str(value))
    )

    # Step 3: Extract symptoms. The general column uses the full default
    # lexicon. The uncommon column must use the uncommon keyword list;
    # previously it repeated the full-lexicon call, so both columns (and
    # both flags) were always identical.
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_symptoms)
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)

    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda x: int(len(x) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda x: int(len(x) > 0))

    # Step 4: Detect alert keywords (substring match on the cleaned text)
    alert_keywords = ["urgent", "emergency", "severe", "critical", "immediate"]
    df["alert_keywords_flag"] = df["cleaned_text"].apply(lambda x: int(any(k in x for k in alert_keywords)))

    return df
|
|
226
|
+
|
|
227
|
+
# -----------------------------
|
|
228
|
+
# Onset time processing
|
|
229
|
+
# -----------------------------
|
|
230
|
+
# Numeric value of the quantity words the onset pattern accepts.
_ONSET_QUANTITY_WORDS = {
    "a": 1, "few": 2, "couple": 2, "several": 3, "many": 5,
    "dozen": 12, "half": 0.5,
}

# Hours represented by one of each supported unit.
_ONSET_UNIT_HOURS = {"hour": 1, "day": 24}

# "<quantity> <hour(s)|day(s)> <after|since|post|following ...>".
# Longer markers such as "after vaccination", "post-dose" or "post jab"
# start with one of these words and are therefore covered too. Quantity
# words need a leading word boundary so that e.g. the final "a" of
# "banana" is not read as the quantity "a".
_ONSET_PATTERN = re.compile(
    r"(\d+|\b(?:a|few|couple|several|many|dozen|half))\s*(hour|day)s?\s*"
    r"(?:after|since|post|following)",
    re.IGNORECASE,
)


def extract_onset_time(text):
    """
    Extract symptom onset delays mentioned in ``text``, in hours.

    Recognises phrases such as "3 hours after shot", "2 days post-dose" or
    "a few days since". Quantities given in days are converted to hours
    (x24), and each phrase contributes exactly one value.

    Args:
        text: Raw text to scan (case-insensitive).

    Returns:
        A list of numbers (hours), one per matching phrase, in order of
        appearance. Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    onset_times = []
    for match in _ONSET_PATTERN.finditer(text):
        quantity = match.group(1).lower()
        if quantity in _ONSET_QUANTITY_WORDS:
            amount = _ONSET_QUANTITY_WORDS[quantity]
        else:
            amount = int(quantity)  # group 1 is otherwise a run of digits
        onset_times.append(amount * _ONSET_UNIT_HOURS[match.group(2).lower()])
    return onset_times
|
|
257
|
+
|
|
258
|
+
def calculate_duration(timestamp, onset_times):
    """
    Offset ``timestamp`` by each onset delay.

    Args:
        timestamp: Reference time of the post (e.g. a ``pandas.Timestamp``).
        onset_times: List of delays in hours. Entries that are not ``int``
            or ``float`` are ignored.

    Returns:
        A list with ``timestamp + delay`` for every usable delay. The list
        is empty when ``timestamp`` is missing (``None``, ``NaN`` or
        ``NaT``) or ``onset_times`` is not a list.
    """
    symptom_duration = []
    # pd.isna covers NaT, None and NaN. An identity check against pd.NaT
    # would let None/NaN through to the addition below.
    if not isinstance(onset_times, list) or pd.isna(timestamp):
        return symptom_duration
    for onset in onset_times:
        if isinstance(onset, (int, float)):
            symptom_duration.append(timestamp + pd.Timedelta(hours=onset))
    return symptom_duration
|
|
266
|
+
|
|
267
|
+
def process_onset_data(df):
    """
    Prepare a DataFrame for onset/duration plotting.

    Works on a copy of ``df``. 'Date' is parsed to datetimes, rows whose
    date cannot be parsed are dropped, and three columns are added:
    'timestamp' (the parsed date), 'onset_time' (result of
    ``extract_onset_time`` on 'text') and 'symptom_duration' (result of
    ``calculate_duration`` for each row).

    Raises:
        ValueError: If the 'Date' or 'text' column is missing.
    """
    if 'Date' not in df.columns:
        raise ValueError("CSV must contain a 'Date' column")
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    # Copy again so the assignments below write to an independent frame.
    df = df.dropna(subset=['Date']).copy()
    df['timestamp'] = df['Date']
    df['onset_time'] = df['text'].apply(extract_onset_time)
    # Built with zip rather than a row-wise DataFrame.apply: on an empty
    # frame, apply(axis=1) returns a DataFrame rather than a Series, which
    # cannot be assigned to a single column.
    df['symptom_duration'] = [
        calculate_duration(timestamp, onsets)
        for timestamp, onsets in zip(df['timestamp'], df['onset_time'])
    ]
    return df
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
def sideeffect_pipeline(
    csv_file,
    project_root=r"C:\Users\91990\Desktop",
    hide_warnings=True,
    hide_nltk_messages=True
):
    """
    Runs the full SideEffect visualization pipeline.
    - Uses trained PKL model + vectorizer
    - Prints initial and filtered row counts
    - Generates plots

    Args:
        csv_file: CSV to analyse. A relative path to an existing file is
            resolved against the current directory before any directory
            change.
        project_root: Directory to switch into and add to ``sys.path`` so
            the package is importable. Ignored when it does not exist on
            this machine.
        hide_warnings: When true, installs a global "ignore" warnings filter.
        hide_nltk_messages: When true, the NLTK resource downloads run
            quietly. The resources are fetched either way.

    Returns:
        The processed DataFrame (after ``main`` and ``process_onset_data``).
    """

    # -----------------------------
    # Setup
    # -----------------------------
    import os
    import sys
    import warnings
    import pandas as pd
    import matplotlib.pyplot as plt

    if hide_warnings:
        # NOTE: this filter is process-wide and stays active after the call.
        warnings.filterwarnings("ignore")

    # Pin a relative local path to the caller's directory before chdir.
    # URLs and file-like objects are left untouched.
    if isinstance(csv_file, (str, os.PathLike)) and os.path.exists(csv_file):
        csv_file = os.path.abspath(csv_file)

    # Ensure project is importable. The default root is a developer-machine
    # path, so only switch directories when the folder actually exists.
    if project_root and os.path.isdir(project_root):
        os.chdir(project_root)
        cwd = os.getcwd()
        if cwd not in sys.path:  # avoid one more entry per call
            sys.path.append(cwd)

    # -----------------------------
    # NLTK resources
    # -----------------------------
    # The flag only controls verbosity. Previously the downloads were
    # skipped entirely when hide_nltk_messages was False.
    import nltk
    for resource in ("punkt", "wordnet", "words", "punkt_tab"):
        nltk.download(resource, quiet=bool(hide_nltk_messages))

    # -----------------------------
    # Imports (after path setup)
    # -----------------------------
    from drugsideeffect.processing import main, process_onset_data
    from drugsideeffect.visualization import (
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
        plot_symptom_extraction,
        plot_onset_times,
    )

    # -----------------------------
    # Load & classify data
    # -----------------------------
    df_initial = pd.read_csv(csv_file)
    print(f"Initial CSV rows: {len(df_initial)}")

    df = main(csv_file)
    print(f"Rows after side effect filtering: {len(df)}")

    # -----------------------------
    # Onset / duration processing
    # -----------------------------
    df = process_onset_data(df)

    # -----------------------------
    # ALL PLOTS (none deleted)
    # -----------------------------
    plotters = (
        plot_symptom_extraction,
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_onset_times,
    )
    for draw in plotters:
        draw(df)
        plt.show()

    print("All plots generated successfully!")

    return df
|