drugsideeffect 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ # __init__.py
2
+
3
+ # Import main processing function
4
+ from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
5
+
6
+ # Import all visualization functions
7
+ from .visualization import (
8
+ plot_day_of_week_distribution,
9
+ plot_data_count_per_month,
10
+ plot_sentiment_distribution,
11
+ plot_known_symptoms,
12
+ plot_uncommon_side_effects_pie_chart,
13
+ plot_proportion_of_english_words,
14
+ plot_create_side_effects_correlation,
15
+ plot_create_side_effects_visualizations,
16
+ plot_visualize,
17
+ plot_symptom_extraction,
18
+ plot_onset_times
19
+ )
20
+
21
+ # Package version
22
# Package version; matches the released distribution (0.1.4).
__version__ = "0.1.4"

# Names exported by a star-import of this package.
__all__ = [
    # Processing helpers
    "main",
    "process_onset_data",
    "classify_sideeffects",
    "clean_text",
    "extract_symptoms",
    "normalize_slang",
    "extract_known_symptoms",
    "extract_uncommon_symptoms",
    "load_and_process_data",
    # Visualization helpers
    "plot_day_of_week_distribution",
    "plot_data_count_per_month",
    "plot_sentiment_distribution",
    "plot_known_symptoms",
    "plot_uncommon_side_effects_pie_chart",
    "plot_proportion_of_english_words",
    "plot_create_side_effects_correlation",
    "plot_create_side_effects_visualizations",
    "plot_visualize",
    "plot_symptom_extraction",
    "plot_onset_times",
    # End-to-end pipeline function defined in this module
    "sideeffect",
]
47
+
48
+ # -----------------------------
49
+ # Pipeline function
50
+ # -----------------------------
51
def sideeffect(input_file_path):
    """
    Execute the complete side-effect workflow for one CSV file.

    The file is processed with ``main`` and the resulting DataFrame is
    handed to every plotting helper in turn. Onset-time processing and the
    onset plot run only when the DataFrame has a 'Date' column.

    Returns the DataFrame produced by the last processing step.
    """
    df = main(input_file_path)

    # Plotters that only need the processed frame, in display order:
    # basic distributions, symptom plots, then correlation/detail views.
    plotters = (
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plotters:
        draw(df)

    # Without a 'Date' column there is nothing to compute onset times from.
    if 'Date' not in df.columns:
        return df

    df = process_onset_data(df)
    plot_onset_times(df)
    return df
@@ -0,0 +1,86 @@
1
+ # __init__.py
2
+
3
+ # Import main processing function
4
+ from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
5
+
6
+ # Import all visualization functions
7
+ from .visualization import (
8
+ plot_day_of_week_distribution,
9
+ plot_data_count_per_month,
10
+ plot_sentiment_distribution,
11
+ plot_known_symptoms,
12
+ plot_uncommon_side_effects_pie_chart,
13
+ plot_proportion_of_english_words,
14
+ plot_create_side_effects_correlation,
15
+ plot_create_side_effects_visualizations,
16
+ plot_visualize,
17
+ plot_symptom_extraction,
18
+ plot_onset_times
19
+ )
20
+
21
+ # Package version
22
# Package version; matches the released distribution (0.1.4).
__version__ = "0.1.4"

# Names exported by a star-import of this package.
__all__ = [
    # Processing helpers
    "main",
    "process_onset_data",
    "classify_sideeffects",
    "clean_text",
    "extract_symptoms",
    "normalize_slang",
    "extract_known_symptoms",
    "extract_uncommon_symptoms",
    "load_and_process_data",
    # Visualization helpers
    "plot_day_of_week_distribution",
    "plot_data_count_per_month",
    "plot_sentiment_distribution",
    "plot_known_symptoms",
    "plot_uncommon_side_effects_pie_chart",
    "plot_proportion_of_english_words",
    "plot_create_side_effects_correlation",
    "plot_create_side_effects_visualizations",
    "plot_visualize",
    "plot_symptom_extraction",
    "plot_onset_times",
    # End-to-end pipeline function defined in this module
    "sideeffect",
]
47
+
48
+ # -----------------------------
49
+ # Pipeline function
50
+ # -----------------------------
51
def sideeffect(input_file_path):
    """
    Execute the complete side-effect workflow for one CSV file.

    The file is processed with ``main`` and the resulting DataFrame is
    handed to every plotting helper in turn. Onset-time processing and the
    onset plot run only when the DataFrame has a 'Date' column.

    Returns the DataFrame produced by the last processing step.
    """
    df = main(input_file_path)

    # Plotters that only need the processed frame, in display order:
    # basic distributions, symptom plots, then correlation/detail views.
    plotters = (
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plotters:
        draw(df)

    # Without a 'Date' column there is nothing to compute onset times from.
    if 'Date' not in df.columns:
        return df

    df = process_onset_data(df)
    plot_onset_times(df)
    return df
File without changes
@@ -0,0 +1,283 @@
1
+ # Processing.py
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import re
6
+ import os
7
+ from textblob import TextBlob
8
+ from datetime import datetime
9
+ import joblib
10
+
11
+ # -----------------------------
12
+ # Load pretrained artifacts
13
+ # -----------------------------
14
# The pretrained artifacts live in the "models" folder next to this module.
BASE_DIR = os.path.dirname(__file__)
MODEL_PATH, VEC_PATH = (
    os.path.join(BASE_DIR, "models", artifact)
    for artifact in ("sideeffect_nb.pkl", "tfidf_vectorizer.pkl")
)

# Both artifacts are deserialized once, when the module is imported.
_model, _vectorizer = (joblib.load(path) for path in (MODEL_PATH, VEC_PATH))
20
+
21
+ # -----------------------------
22
+ # Side effect classification
23
+ # -----------------------------
24
def classify_sideeffects(df):
    """
    Predict which rows describe a side effect and keep only those rows.

    The 'text' column is converted to strings, vectorized with the
    module-level vectorizer and scored with the module-level model. The
    predictions are stored in a 'sideeffect_pred' column
    (1 = sideeffect, 0 = no_sideeffect) and only rows predicted as 1 are
    returned. The row counts before and after filtering are printed.

    Args:
        df: DataFrame containing a 'text' column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the rows predicted
        as side effects, including the 'sideeffect_pred' column. The input
        DataFrame is not modified.

    Raises:
        ValueError: If ``df`` has no 'text' column.
    """
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    initial_rows = len(df)

    if initial_rows == 0:
        # Nothing to score: skip the vectorizer/model, which may reject
        # zero-sample input.
        predictions = []
    else:
        X_vec = _vectorizer.transform(df["text"].astype(str))
        predictions = _model.predict(X_vec)

    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    scored = df.assign(sideeffect_pred=predictions)
    filtered_df = scored[scored["sideeffect_pred"] == 1].reset_index(drop=True)
    final_rows = len(filtered_df)

    print(f"Initial CSV had {initial_rows} rows; after filtering side effects, {final_rows} rows remain.")

    return filtered_df
46
+
47
+ # -----------------------------
48
+ # Text cleaning
49
+ # -----------------------------
50
def clean_text(text):
    """
    Normalize a raw text value.

    URLs, @mentions and every character that is neither an ASCII letter nor
    whitespace are removed; whitespace runs are collapsed to one space, the
    ends are trimmed and the result is lower-cased. Anything that is not a
    ``str`` yields an empty string.
    """
    if isinstance(text, str):
        # (pattern, replacement) pairs, applied strictly in this order.
        substitutions = (
            (r"http\S+|www\S+|https\S+", ""),
            (r"@\w+", ""),
            (r"[^A-Za-z\s]", ""),
            (r"\s+", " "),
        )
        cleaned = text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip().lower()

    return ""
64
+
65
+ # -----------------------------
66
+ # Symptom extraction
67
+ # -----------------------------
68
# Default lexicon used by extract_symptoms() when the caller supplies none.
# Built once at import time instead of on every call.
_DEFAULT_SYMPTOM_LEXICON = (
    "spike protein", "diabetes", "vascular", "autoimmune", "p53",
    "t cell", "vitamin d", "contaminated", "zinc", "circumin",
    "ivermectin", "cancer", "myocarditis", "hospital", "outpatient",
    "inpatient", "infection", "bacteria", "fungal", "viral", "sepsis",
    "respiratory", "gynaecology", "dermatology", "ophthalmology",
    "otology", "dental", "hypoxia", "doxycycline", "nanosolver",
    "anticoagulation", "aspirin", "stroke", "heart attack",
    "coronary artery disease", "arrhythmia", "autism",
    "neurodegenerative", "alzheimer", "cognitive",
    "vascular dementia", "parkinson", "immune", "pots", "mcas",
    "insomnia", "new onset dyslipidaemia", "hypertension",
    "cardio-metabolic disturbance", "nervous system",
    "mast cell activity in skin", "post-covid-19 vaccine syndrome",
    "seizure disorders", "migraines", "neuropathy",
    "inflammatory bowel disease", "depression", "anxiety disorders",
    "chronic fatigue syndrome", "lyme disease", "fibromyalgia",
    "arthritis", "chronic obstructive pulmonary disease", "copd",
    "asthma", "chronic kidney disease", "ckd", "chronic heart failure",
    "chf", "bleeding disorders", "atherosclerosis", "vasculopathies",
    "endothelialitis", "thrombosis", "coagulopathy", "long covid",
    "thrombocytopenia", "low platelet", "internal bleeding",
    "lymphopenia", "neutropenia", "suppressed immune", "immune dysfunction",
    "muscle pain", "joint pain", "vomiting", "fever", "autoimmunity",
    "sleep apnea", "guillian barre syndrome", "adem", "cvst",
    "spike amyloids hamper fibrinolysis", "sticky blood", "neuropsychiatric",
    "mrna", "psychosis", "dementia", "schizophrenia", "suicidal",
    "homicidal", "brain clot", "violent behavior", "cognitive decline",
    "delusion", "takotsubo cardiomyopathy", "lipid nanoparticle toxicity",
    "allergenic", "cytotoxic", "pneumonia", "endocrine",
    "immune microclot", "vascular dysfunction", "teamclot",
    "organ impairment", "endothelian diagnostic", "thromboembolic events",
    "inflammatory cytokine increase", "allergic reactions", "igg increase",
    "iga increase",
)


def extract_symptoms(text, lexicon=None):
    """
    Return the lexicon entries that occur in ``text``.

    Matching is a plain substring test against the lower-cased text, so an
    entry also matches inside a longer word. Lexicon entries are compared
    as given (they are not lower-cased).

    Args:
        text: Text to scan. Any non-string value yields an empty list.
        lexicon: Iterable of symptom strings. When ``None``, the built-in
            default lexicon is used.

    Returns:
        A list of matching entries, in lexicon order.
    """
    if lexicon is None:
        lexicon = _DEFAULT_SYMPTOM_LEXICON

    if not isinstance(text, str):
        return []

    # Lower-case once rather than once per lexicon entry.
    lowered = text.lower()
    return [symptom for symptom in lexicon if symptom in lowered]
113
+
114
+ # -----------------------------
115
+ # Slang normalization
116
+ # -----------------------------
117
# Maps informal phrases (lower-case) to the standard symptom names they imply.
slang_lexicon = {
    "feel like shit": ["fatigue", "malaise"],
    "exhausted": ["fatigue"],
    "shield against the storm": ["immune response", "general malaise"],
    "i have been run over by a truck": ["muscle pain", "joint pain", "fatigue"],
    "tired": ["fatigue"],
    "knackered": ["fatigue"],
    "wiped out": ["fatigue"],
    "brain fog": ["cognitive"],
    "sleepy all day": ["insomnia", "fatigue"],
    "my head is pounding": ["headache"],
    "can't sleep": ["insomnia"],
    "heart racing": ["arrhythmia"],
    "out of breath": ["respiratory distress", "fatigue"]
}


def normalize_slang(text, slang_lexicon=slang_lexicon):
    """
    Translate slang phrases found in ``text`` into standard symptom names.

    Each key of ``slang_lexicon`` is searched as a substring of the
    lower-cased text; the symptoms of every matching key are collected.

    Args:
        text: Text to scan. Any non-string value yields an empty list.
        slang_lexicon: Mapping of slang phrase -> list of symptom names.
            Defaults to the module-level ``slang_lexicon``; it is only read.

    Returns:
        A list of unique symptom names in first-seen order (lexicon order,
        then order within each entry), so the result is deterministic.
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    matched_symptoms = []

    for slang, symptoms in slang_lexicon.items():
        if slang in text_lower:
            matched_symptoms.extend(symptoms)

    # dict.fromkeys removes duplicates while keeping insertion order;
    # list(set(...)) would return the symptoms in an arbitrary order.
    return list(dict.fromkeys(matched_symptoms))
149
+
150
+ # -----------------------------
151
+ # Extract known/uncommon symptoms
152
+ # -----------------------------
153
def extract_known_symptoms(text):
    """
    Return the known-symptom keywords that occur in ``text``.

    Matching is a plain substring test against the lower-cased text.

    Args:
        text: Text to scan. Any non-string value yields an empty list.

    Returns:
        A list of matching keywords, in the order of the keyword list.
    """
    known_symptoms_keywords = [
        "fever", "fatigue", "headache", "muscle pain", "joint pain",
        "vomiting", "insomnia", "cognitive", "anxiety disorders",
        "depression", "respiratory", "asthma", "chronic fatigue syndrome",
        "migraine", "neuropathy", "sleep apnea"
    ]
    if not isinstance(text, str):
        return []

    # Lower-case once rather than once per keyword.
    lowered = text.lower()
    return [s for s in known_symptoms_keywords if s in lowered]
163
+
164
def extract_uncommon_symptoms(text):
    """
    Return the uncommon-symptom keywords that occur in ``text``.

    Matching is a plain substring test against the lower-cased text.

    Args:
        text: Text to scan. Any non-string value yields an empty list.

    Returns:
        A list of matching keywords, in the order of the keyword list.
    """
    uncommon_symptoms_keywords = [
        "myocarditis", "stroke", "heart attack", "coronary artery disease",
        "arrhythmia", "thrombosis", "coagulopathy", "thrombocytopenia",
        "low platelet", "internal bleeding", "lymphopenia", "neutropenia",
        "guillian barre syndrome", "adem", "cvst", "takotsubo cardiomyopathy",
        "lipid nanoparticle toxicity", "brain clot", "psychosis", "schizophrenia",
        "suicidal", "homicidal", "autoimmunity", "vascular dysfunction",
        "immune dysfunction", "organ impairment", "spike amyloids hamper fibrinolysis",
        "sticky blood", "neuropsychiatric", "post-covid-19 vaccine syndrome", "long covid"
    ]
    if not isinstance(text, str):
        return []

    # Lower-case once rather than once per keyword.
    lowered = text.lower()
    return [s for s in uncommon_symptoms_keywords if s in lowered]
178
+
179
+ # -----------------------------
180
+ # Load and process CSV
181
+ # -----------------------------
182
def load_and_process_data(input_file_path, text_column="text"):
    """
    Load a CSV file and add cleaned-text, symptom and alert columns.

    Unlike ``main``, no side-effect classification/filtering is applied.

    Added columns:
        cleaned_text: ``clean_text`` applied to ``text_column``.
        extracted_symptoms: result of ``extract_known_symptoms``.
        extracted_uncommon_symptoms: result of ``extract_uncommon_symptoms``.
        known_symptoms_flag / uncommon_symptoms_flag: 1 when the matching
            list is non-empty, else 0.
        alert_keywords_flag: 1 when the cleaned text contains any alert
            keyword as a substring, else 0.

    Args:
        input_file_path: Path (or other input accepted by ``pd.read_csv``).
        text_column: Name of the column holding the raw text.

    Returns:
        The loaded DataFrame with the columns above added.

    Raises:
        ValueError: If ``text_column`` is not present in the file.
    """
    df = pd.read_csv(input_file_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in input file.")

    # Missing cells become "" instead of being stringified to the literal
    # word "nan" and kept as text.
    df["cleaned_text"] = df[text_column].apply(
        lambda value: "" if pd.isna(value) else clean_text(str(value))
    )
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_known_symptoms)
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)
    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda found: int(len(found) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda found: int(len(found) > 0))

    alert_keywords = ("urgent", "emergency", "severe", "critical", "immediate")
    df["alert_keywords_flag"] = df["cleaned_text"].apply(
        lambda cleaned: int(any(keyword in cleaned for keyword in alert_keywords))
    )

    return df
198
+
199
+ # -----------------------------
200
+ # Main pipeline
201
+ # -----------------------------
202
def main(csv_path, text_column="text"):
    """
    Load a CSV, keep predicted side-effect rows and add analysis columns.

    Steps:
        1. Filter rows with ``classify_sideeffects``.
        2. Clean ``text_column`` into 'cleaned_text'.
        3. Extract symptoms: 'extracted_symptoms' uses the full default
           lexicon of ``extract_symptoms``; 'extracted_uncommon_symptoms'
           uses ``extract_uncommon_symptoms``. A 0/1 flag column is added
           for each.
        4. Flag rows whose cleaned text contains an alert keyword.

    Args:
        csv_path: Path (or other input accepted by ``pd.read_csv``).
        text_column: Name of the column holding the raw text.
            NOTE: ``classify_sideeffects`` always reads a column literally
            named 'text', so a file without that column fails at step 1
            even when ``text_column`` names another column.

    Returns:
        The filtered DataFrame with the columns above added.

    Raises:
        ValueError: If ``text_column`` (or, at step 1, 'text') is missing.
    """
    df = pd.read_csv(csv_path)

    if text_column not in df.columns:
        raise ValueError(f"CSV must contain a '{text_column}' column")

    # Step 1: Filter side effects
    df = classify_sideeffects(df)

    # Step 2: Clean text. Missing cells become "" instead of being
    # stringified to the literal word "nan".
    df["cleaned_text"] = df[text_column].apply(
        lambda value: "" if pd.isna(value) else clean_text(str(value))
    )

    # Step 3: Extract symptoms. The uncommon column uses the dedicated
    # uncommon-symptom extractor; running the full lexicon for both columns
    # made them identical.
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_symptoms)
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)

    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda found: int(len(found) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda found: int(len(found) > 0))

    # Step 4: Detect alert keywords
    alert_keywords = ("urgent", "emergency", "severe", "critical", "immediate")
    df["alert_keywords_flag"] = df["cleaned_text"].apply(
        lambda cleaned: int(any(keyword in cleaned for keyword in alert_keywords))
    )

    return df
226
+
227
+ # -----------------------------
228
+ # Onset time processing
229
+ # -----------------------------
230
def extract_onset_time(text):
    """
    Extract symptom onset delays, in hours, from free text.

    Two patterns are tried, each contributing at most its first match:
    a quantity + unit followed by an explicit vaccination phrase
    (e.g. "2 hours after vaccination"), and a quantity + unit followed by a
    generic connector (e.g. "3 days after"). The quantity is a number or one
    of the words a/few/couple/several/many/dozen/half. Day quantities are
    converted to hours (x24), because the values are later added to a
    timestamp as hours. A phrase matched by both patterns is reported once.

    Args:
        text: Text to scan. Any non-string value yields an empty list.

    Returns:
        A list of onset delays in hours (ints, or floats for "half").
    """
    if not isinstance(text, str):
        return []

    # Word -> numeric value. Only the quantity words captured by the
    # patterns below are ever looked up here.
    replacements = {
        "a": 1, "few": 2, "couple": 2, "several": 3, "many": 5,
        "dozen": 12, "half": 0.5, "long": 8, "short": 1, "some": 3,
        "next": 24, "last": 24, "immediate": 0, "soon": 1, "this": 1,
        "after": 1, "before": 1, "morning": 6, "afternoon": 6,
        "evening": 6, "night": 8, "week": 168, "month": 730
    }

    # Quantity words must be whole words, so that e.g. the trailing "a" of
    # "extra hours" is not read as the quantity "a".
    quantity = r"(\d+|\b(?:a|few|couple|several|many|dozen|half)\b)"
    unit = r"(hours?|days?)"
    time_patterns = [
        quantity + r"\s*" + unit
        + r"\s*(post-dose|after\s*vaccination|after\s*shot|after\s*injection|post-vaccine|post\s*jab)",
        quantity + r"\s*" + unit + r"\s*(after|since|post|following)",
    ]

    onset_times = []
    seen_starts = set()
    for pattern in time_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match or match.start() in seen_starts:
            # No match, or the same phrase was already counted.
            continue
        seen_starts.add(match.start())

        value = match.group(1).lower()
        amount = replacements[value] if value in replacements else int(value)
        if match.group(2).lower().startswith("day"):
            amount *= 24  # normalize days to hours
        onset_times.append(amount)
    return onset_times
257
+
258
def calculate_duration(timestamp, onset_times):
    """
    Offset ``timestamp`` by each onset delay.

    Args:
        timestamp: Base timestamp. A missing value (None, NaN or NaT)
            yields an empty list.
        onset_times: List of delays in hours. Anything that is not a list
            yields an empty list; entries that are not int/float are skipped.

    Returns:
        A list with ``timestamp + delay`` for every numeric delay, in order.
    """
    # pd.isna also covers None and NaN, which `is pd.NaT` let through and
    # which would then fail on the addition below.
    if pd.isna(timestamp) or not isinstance(onset_times, list):
        return []

    return [
        timestamp + pd.Timedelta(hours=onset)
        for onset in onset_times
        if isinstance(onset, (int, float))
    ]
266
+
267
def process_onset_data(df):
    """
    Prepare a DataFrame for onset/duration plotting.

    'Date' is parsed to datetimes (unparseable values become missing) and
    rows without a valid date are dropped. Three columns are then added:
    'timestamp' (copy of 'Date'), 'onset_time' (``extract_onset_time`` of
    'text') and 'symptom_duration' (``calculate_duration`` of the two).

    Args:
        df: DataFrame with 'Date' and 'text' columns. It is not modified.

    Returns:
        A new DataFrame with the rows that have a valid date and the three
        added columns. Works for a frame with zero remaining rows as well.

    Raises:
        ValueError: If the 'Date' or 'text' column is missing.
    """
    if 'Date' not in df.columns:
        raise ValueError("CSV must contain a 'Date' column")
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    # assign()/dropna() each return a new frame, so the input stays untouched
    # and no column is set on a slice of another frame.
    dated = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce')).dropna(subset=['Date'])

    onset_times = dated['text'].apply(extract_onset_time)

    # Built row by row instead of DataFrame.apply(axis=1): on an empty frame
    # apply returns a DataFrame, which cannot be stored as a single column.
    durations = pd.Series(
        [calculate_duration(ts, onsets) for ts, onsets in zip(dated['Date'], onset_times)],
        index=dated.index,
        dtype=object,
    )

    return dated.assign(
        timestamp=dated['Date'],
        onset_time=onset_times,
        symptom_duration=durations,
    )
@@ -0,0 +1,89 @@
1
def sideeffect_pipeline(
    csv_file,
    project_root=r"C:\Users\91990\Desktop",
    hide_warnings=True,
    hide_nltk_messages=True
):
    """
    Run the full SideEffect visualization pipeline.

    - Uses the trained PKL model + vectorizer (through ``main``)
    - Prints initial and filtered row counts
    - Generates and shows every plot

    Args:
        csv_file: Path of the CSV file to analyse. A relative path is
            resolved after the working directory was switched to
            ``project_root`` (when that switch happens).
        project_root: Directory to switch into and add to ``sys.path``
            before importing the package. Skipped when it is ``None`` or
            not an existing directory, so the machine-specific default does
            not break the call elsewhere. The working-directory change is
            not undone afterwards.
        hide_warnings: When true, Python warnings are suppressed for the
            duration of this call only.
        hide_nltk_messages: When true, the NLTK resources 'punkt',
            'wordnet', 'words' and 'punkt_tab' are downloaded quietly up
            front. When false, no download is attempted here.

    Returns:
        The processed DataFrame (after onset processing when the data has a
        'Date' column).
    """
    # -----------------------------
    # Setup
    # -----------------------------
    import os
    import sys
    import warnings
    import pandas as pd
    import matplotlib.pyplot as plt

    # catch_warnings restores the caller's warning filters on exit instead
    # of silencing warnings for the rest of the process.
    with warnings.catch_warnings():
        if hide_warnings:
            warnings.simplefilter("ignore")

        # Ensure project is importable
        if project_root and os.path.isdir(project_root):
            os.chdir(project_root)
            if os.getcwd() not in sys.path:
                sys.path.append(os.getcwd())

        # -----------------------------
        # Silence NLTK messages
        # -----------------------------
        if hide_nltk_messages:
            import nltk
            for resource in ("punkt", "wordnet", "words", "punkt_tab"):
                nltk.download(resource, quiet=True)

        # -----------------------------
        # Imports (after path setup)
        # -----------------------------
        from drugsideeffect.processing import main, process_onset_data
        from drugsideeffect.visualization import (
            plot_day_of_week_distribution,
            plot_data_count_per_month,
            plot_sentiment_distribution,
            plot_known_symptoms,
            plot_uncommon_side_effects_pie_chart,
            plot_proportion_of_english_words,
            plot_create_side_effects_correlation,
            plot_create_side_effects_visualizations,
            plot_visualize,
            plot_symptom_extraction,
            plot_onset_times,
        )

        # -----------------------------
        # Load & classify data
        # -----------------------------
        df_initial = pd.read_csv(csv_file)
        print(f"Initial CSV rows: {len(df_initial)}")

        df = main(csv_file)
        print(f"Rows after side effect filtering: {len(df)}")

        # -----------------------------
        # Onset / duration processing
        # -----------------------------
        # Only possible when the data has a 'Date' column; otherwise the
        # onset step and its plot are skipped instead of raising.
        has_dates = 'Date' in df.columns
        if has_dates:
            df = process_onset_data(df)

        # -----------------------------
        # ALL PLOTS (none deleted)
        # -----------------------------
        plotters = [
            plot_symptom_extraction,
            plot_day_of_week_distribution,
            plot_data_count_per_month,
            plot_sentiment_distribution,
            plot_create_side_effects_correlation,
            plot_create_side_effects_visualizations,
            plot_visualize,
            plot_known_symptoms,
            plot_uncommon_side_effects_pie_chart,
            plot_proportion_of_english_words,
        ]
        if has_dates:
            plotters.append(plot_onset_times)

        for draw in plotters:
            draw(df)
            plt.show()

        print("All plots generated successfully!")

    return df