drugsideeffect 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- drugsideeffect-0.1.4/MANIFEST.in +2 -0
- drugsideeffect-0.1.4/PKG-INFO +152 -0
- drugsideeffect-0.1.4/drugsideeffect/__init__.py +86 -0
- drugsideeffect-0.1.4/drugsideeffect/drugsideeffect/__init__.py +86 -0
- drugsideeffect-0.1.4/drugsideeffect/drugsideeffect/models/__init__.py +0 -0
- drugsideeffect-0.1.4/drugsideeffect/drugsideeffect/processing.py +283 -0
- drugsideeffect-0.1.4/drugsideeffect/drugsideeffect/sideeffect_pipeline.py +89 -0
- drugsideeffect-0.1.4/drugsideeffect/drugsideeffect/visualization.py +1700 -0
- drugsideeffect-0.1.4/drugsideeffect/models/__init__.py +0 -0
- drugsideeffect-0.1.4/drugsideeffect/models/sideeffect_nb.pkl +0 -0
- drugsideeffect-0.1.4/drugsideeffect/models/tfidf_vectorizer.pkl +0 -0
- drugsideeffect-0.1.4/drugsideeffect/processing.py +283 -0
- drugsideeffect-0.1.4/drugsideeffect/setup.py +37 -0
- drugsideeffect-0.1.4/drugsideeffect/sideeffect_pipeline.py +89 -0
- drugsideeffect-0.1.4/drugsideeffect/visualization.py +1700 -0
- drugsideeffect-0.1.4/drugsideeffect.egg-info/PKG-INFO +152 -0
- drugsideeffect-0.1.4/drugsideeffect.egg-info/SOURCES.txt +22 -0
- drugsideeffect-0.1.4/drugsideeffect.egg-info/dependency_links.txt +1 -0
- drugsideeffect-0.1.4/drugsideeffect.egg-info/not-zip-safe +1 -0
- drugsideeffect-0.1.4/drugsideeffect.egg-info/requires.txt +8 -0
- drugsideeffect-0.1.4/drugsideeffect.egg-info/top_level.txt +1 -0
- drugsideeffect-0.1.4/readme.md +123 -0
- drugsideeffect-0.1.4/setup.cfg +4 -0
- drugsideeffect-0.1.4/setup.py +37 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: drugsideeffect
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Visualize side effects from textual data
|
|
5
|
+
Home-page: https://github.com/debbdeb/drugsideeffect
|
|
6
|
+
Author: Briti Deb
|
|
7
|
+
Author-email: britideb@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: pandas>=1.5
|
|
13
|
+
Requires-Dist: numpy>=1.23
|
|
14
|
+
Requires-Dist: matplotlib>=3.6
|
|
15
|
+
Requires-Dist: seaborn>=0.12
|
|
16
|
+
Requires-Dist: plotly>=5.11
|
|
17
|
+
Requires-Dist: nltk>=3.8
|
|
18
|
+
Requires-Dist: textblob>=0.10
|
|
19
|
+
Requires-Dist: spacy>=3.5
|
|
20
|
+
Dynamic: author
|
|
21
|
+
Dynamic: author-email
|
|
22
|
+
Dynamic: classifier
|
|
23
|
+
Dynamic: description
|
|
24
|
+
Dynamic: description-content-type
|
|
25
|
+
Dynamic: home-page
|
|
26
|
+
Dynamic: requires-dist
|
|
27
|
+
Dynamic: requires-python
|
|
28
|
+
Dynamic: summary
|
|
29
|
+
|
|
30
|
+
# Python package sideeffect
|
|
31
|
+
|
|
32
|
+
**sideeffect** is a Python library for processing textual health-related data to extract, analyze, and visualize potential side effects of pharmaceutical drugs from text sources such as social media posts, surveys, or reports.
|
|
33
|
+
|
|
34
|
+
It provides a pipeline for:
|
|
35
|
+
- Cleaning and preprocessing text
|
|
36
|
+
- Extracting known and uncommon symptoms
|
|
37
|
+
- Detecting alert-related keywords
|
|
38
|
+
- Generating multiple analytical visualizations
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
# Disclaimer
|
|
43
|
+
|
|
44
|
+
**This library is for informational, educational, and research purposes only.**
|
|
45
|
+
It is **not intended to provide medical advice, diagnosis, or treatment**.
|
|
46
|
+
Do **not** use it as a substitute for professional healthcare guidance.
|
|
47
|
+
Always consult a qualified healthcare professional regarding any medical concerns.
|
|
48
|
+
|
|
49
|
+
Note: The code is provided for demonstration purposes; it is not fully optimized and generates only basic visualizations.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
# Installation from PyPI:
|
|
54
|
+
|
|
55
|
+
%pip install drugsideeffect
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Quick Start Example
|
|
59
|
+
|
|
60
|
+
import warnings
|
|
61
|
+
warnings.filterwarnings("ignore") # Hide warnings
|
|
62
|
+
|
|
63
|
+
import os
|
|
64
|
+
import pandas as pd
|
|
65
|
+
|
|
66
|
+
# Set working directory
|
|
67
|
+
os.chdir("C:\\")
|
|
68
|
+
|
|
69
|
+
# Add the project folder to path
|
|
70
|
+
import sys
|
|
71
|
+
sys.path.append("C:\\")
|
|
72
|
+
|
|
73
|
+
# Import the custom functions
|
|
74
|
+
from drugsideeffect.processing import main
|
|
75
|
+
from drugsideeffect.visualization import (
|
|
76
|
+
plot_day_of_week_distribution,
|
|
77
|
+
plot_data_count_per_month,
|
|
78
|
+
plot_known_symptoms,
|
|
79
|
+
plot_uncommon_side_effects_pie_chart,
|
|
80
|
+
plot_proportion_of_english_words,
|
|
81
|
+
plot_create_side_effects_correlation,
|
|
82
|
+
plot_create_side_effects_visualizations,
|
|
83
|
+
plot_sentiment_distribution,
|
|
84
|
+
plot_visualize,
|
|
85
|
+
plot_symptom_extraction,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# -----------------------------
|
|
90
|
+
# Step 1: Load a csv file having at least four columns for example:
|
|
91
|
+
## Headers: Date, month, RT_Like, text
|
|
92
|
+
## Row values: 3/15/2023 8:23, 4, 1, Day 2 and I feel better
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# -----------------------------
|
|
96
|
+
df = pd.read_csv("data.csv")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# -----------------------------
|
|
100
|
+
# Step 2: Extract symptoms & alert keywords
|
|
101
|
+
# -----------------------------
|
|
102
|
+
# This must be done first so that derived columns like 'extracted_symptoms' exist
|
|
103
|
+
plot_symptom_extraction(df)
|
|
104
|
+
|
|
105
|
+
# -----------------------------
|
|
106
|
+
# Step 3: Basic distributions & sentiment analysis
|
|
107
|
+
# -----------------------------
|
|
108
|
+
plot_day_of_week_distribution(df)
|
|
109
|
+
plot_data_count_per_month(df)
|
|
110
|
+
plot_sentiment_distribution(df)
|
|
111
|
+
|
|
112
|
+
# -----------------------------
|
|
113
|
+
# Step 4: Side effects correlation & detailed visualizations
|
|
114
|
+
# -----------------------------
|
|
115
|
+
plot_create_side_effects_correlation(df)
|
|
116
|
+
plot_create_side_effects_visualizations(df)
|
|
117
|
+
plot_visualize(df)
|
|
118
|
+
|
|
119
|
+
# -----------------------------
|
|
120
|
+
# Step 5: Plots that depend on extracted columns
|
|
121
|
+
# -----------------------------
|
|
122
|
+
plot_known_symptoms(df)
|
|
123
|
+
plot_uncommon_side_effects_pie_chart(df)
|
|
124
|
+
plot_proportion_of_english_words(df)
|
|
125
|
+
|
|
126
|
+
print("All plots generated successfully!")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# License
|
|
132
|
+
MIT License
|
|
133
|
+
|
|
134
|
+
Copyright (c) 2026 Briti Deb
|
|
135
|
+
|
|
136
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
137
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
138
|
+
in the Software without restriction, including without limitation the rights
|
|
139
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
140
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
141
|
+
furnished to do so, subject to the following conditions:
|
|
142
|
+
|
|
143
|
+
The above copyright notice and this permission notice shall be included in all
|
|
144
|
+
copies or substantial portions of the Software.
|
|
145
|
+
|
|
146
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
147
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
148
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
149
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
150
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
151
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
152
|
+
SOFTWARE.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# __init__.py
|
|
2
|
+
|
|
3
|
+
# Import main processing function
|
|
4
|
+
from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
|
|
5
|
+
|
|
6
|
+
# Import all visualization functions
|
|
7
|
+
from .visualization import (
|
|
8
|
+
plot_day_of_week_distribution,
|
|
9
|
+
plot_data_count_per_month,
|
|
10
|
+
plot_sentiment_distribution,
|
|
11
|
+
plot_known_symptoms,
|
|
12
|
+
plot_uncommon_side_effects_pie_chart,
|
|
13
|
+
plot_proportion_of_english_words,
|
|
14
|
+
plot_create_side_effects_correlation,
|
|
15
|
+
plot_create_side_effects_visualizations,
|
|
16
|
+
plot_visualize,
|
|
17
|
+
plot_symptom_extraction,
|
|
18
|
+
plot_onset_times
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Package version
|
|
22
|
+
# Keep in sync with the distribution version declared in the package metadata.
__version__ = "0.1.4"
|
|
23
|
+
|
|
24
|
+
# Define all accessible names
|
|
25
|
+
__all__ = [
|
|
26
|
+
"main",
|
|
27
|
+
"process_onset_data",
|
|
28
|
+
"classify_sideeffects",
|
|
29
|
+
"clean_text",
|
|
30
|
+
"extract_symptoms",
|
|
31
|
+
"normalize_slang",
|
|
32
|
+
"extract_known_symptoms",
|
|
33
|
+
"extract_uncommon_symptoms",
|
|
34
|
+
"load_and_process_data",
|
|
35
|
+
"plot_day_of_week_distribution",
|
|
36
|
+
"plot_data_count_per_month",
|
|
37
|
+
"plot_sentiment_distribution",
|
|
38
|
+
"plot_known_symptoms",
|
|
39
|
+
"plot_uncommon_side_effects_pie_chart",
|
|
40
|
+
"plot_proportion_of_english_words",
|
|
41
|
+
"plot_create_side_effects_correlation",
|
|
42
|
+
"plot_create_side_effects_visualizations",
|
|
43
|
+
"plot_visualize",
|
|
44
|
+
"plot_symptom_extraction",
|
|
45
|
+
"plot_onset_times"
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
# -----------------------------
|
|
49
|
+
# Pipeline function
|
|
50
|
+
# -----------------------------
|
|
51
|
+
def sideeffect(input_file_path):
    """
    Run the complete side-effect analysis and plotting pipeline on a CSV file.

    The file is loaded and processed by ``main``; the resulting DataFrame is
    then handed to every plotting function in a fixed order. When the frame
    has a 'Date' column, onset data is derived with ``process_onset_data`` and
    plotted with ``plot_onset_times``.

    Args:
        input_file_path: Path of the CSV file passed to ``main``.

    Returns:
        The final DataFrame (including the onset columns when 'Date' exists).
    """
    # Step 1: Load and process CSV
    df = main(input_file_path)

    # Steps 2-4: every plot takes the same frame, so run them from one table.
    plot_steps = (
        # basic distributions
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        # symptom plots
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        # correlation & detailed visualizations
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plot_steps:
        draw(df)

    # Step 5: Onset time plots (only possible when a 'Date' column exists)
    if 'Date' not in df.columns:
        return df

    df = process_onset_data(df)
    plot_onset_times(df)
    return df
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# __init__.py
|
|
2
|
+
|
|
3
|
+
# Import main processing function
|
|
4
|
+
from .processing import main, process_onset_data, classify_sideeffects, clean_text, extract_symptoms, normalize_slang, extract_known_symptoms, extract_uncommon_symptoms, load_and_process_data
|
|
5
|
+
|
|
6
|
+
# Import all visualization functions
|
|
7
|
+
from .visualization import (
|
|
8
|
+
plot_day_of_week_distribution,
|
|
9
|
+
plot_data_count_per_month,
|
|
10
|
+
plot_sentiment_distribution,
|
|
11
|
+
plot_known_symptoms,
|
|
12
|
+
plot_uncommon_side_effects_pie_chart,
|
|
13
|
+
plot_proportion_of_english_words,
|
|
14
|
+
plot_create_side_effects_correlation,
|
|
15
|
+
plot_create_side_effects_visualizations,
|
|
16
|
+
plot_visualize,
|
|
17
|
+
plot_symptom_extraction,
|
|
18
|
+
plot_onset_times
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Package version
|
|
22
|
+
# Keep in sync with the distribution version declared in the package metadata.
__version__ = "0.1.4"
|
|
23
|
+
|
|
24
|
+
# Define all accessible names
|
|
25
|
+
__all__ = [
|
|
26
|
+
"main",
|
|
27
|
+
"process_onset_data",
|
|
28
|
+
"classify_sideeffects",
|
|
29
|
+
"clean_text",
|
|
30
|
+
"extract_symptoms",
|
|
31
|
+
"normalize_slang",
|
|
32
|
+
"extract_known_symptoms",
|
|
33
|
+
"extract_uncommon_symptoms",
|
|
34
|
+
"load_and_process_data",
|
|
35
|
+
"plot_day_of_week_distribution",
|
|
36
|
+
"plot_data_count_per_month",
|
|
37
|
+
"plot_sentiment_distribution",
|
|
38
|
+
"plot_known_symptoms",
|
|
39
|
+
"plot_uncommon_side_effects_pie_chart",
|
|
40
|
+
"plot_proportion_of_english_words",
|
|
41
|
+
"plot_create_side_effects_correlation",
|
|
42
|
+
"plot_create_side_effects_visualizations",
|
|
43
|
+
"plot_visualize",
|
|
44
|
+
"plot_symptom_extraction",
|
|
45
|
+
"plot_onset_times"
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
# -----------------------------
|
|
49
|
+
# Pipeline function
|
|
50
|
+
# -----------------------------
|
|
51
|
+
def sideeffect(input_file_path):
    """
    Run the complete side-effect analysis and plotting pipeline on a CSV file.

    The file is loaded and processed by ``main``; the resulting DataFrame is
    then handed to every plotting function in a fixed order. When the frame
    has a 'Date' column, onset data is derived with ``process_onset_data`` and
    plotted with ``plot_onset_times``.

    Args:
        input_file_path: Path of the CSV file passed to ``main``.

    Returns:
        The final DataFrame (including the onset columns when 'Date' exists).
    """
    # Step 1: Load and process CSV
    df = main(input_file_path)

    # Steps 2-4: every plot takes the same frame, so run them from one table.
    plot_steps = (
        # basic distributions
        plot_day_of_week_distribution,
        plot_data_count_per_month,
        plot_sentiment_distribution,
        # symptom plots
        plot_known_symptoms,
        plot_uncommon_side_effects_pie_chart,
        plot_proportion_of_english_words,
        plot_symptom_extraction,
        # correlation & detailed visualizations
        plot_create_side_effects_correlation,
        plot_create_side_effects_visualizations,
        plot_visualize,
    )
    for draw in plot_steps:
        draw(df)

    # Step 5: Onset time plots (only possible when a 'Date' column exists)
    if 'Date' not in df.columns:
        return df

    df = process_onset_data(df)
    plot_onset_times(df)
    return df
|
|
File without changes
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# Processing.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import re
|
|
6
|
+
import os
|
|
7
|
+
from textblob import TextBlob
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
import joblib
|
|
10
|
+
|
|
11
|
+
# -----------------------------
|
|
12
|
+
# Load pretrained artifacts
|
|
13
|
+
# -----------------------------
|
|
14
|
+
# Artifact paths are resolved relative to this module's own directory, so the
# "models" folder must sit next to this file.
BASE_DIR = os.path.dirname(__file__)
MODEL_PATH = os.path.join(BASE_DIR, "models", "sideeffect_nb.pkl")
VEC_PATH = os.path.join(BASE_DIR, "models", "tfidf_vectorizer.pkl")

# Both artifacts are loaded once, at import time; importing this module fails
# if either file is missing.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- these paths must only ever point at the trusted, packaged artifacts.
_model = joblib.load(MODEL_PATH)
_vectorizer = joblib.load(VEC_PATH)
|
|
20
|
+
|
|
21
|
+
# -----------------------------
|
|
22
|
+
# Side effect classification
|
|
23
|
+
# -----------------------------
|
|
24
|
+
def classify_sideeffects(df):
    """
    Predict side-effect rows and keep only those.

    A 'sideeffect_pred' column is written onto ``df`` (the input frame itself
    is modified), then the rows predicted as 1 are returned as a new frame
    with a fresh 0..n-1 index. The row counts before and after filtering are
    printed.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        DataFrame containing only the rows whose prediction equals 1.

    Raises:
        ValueError: If ``df`` has no 'text' column.
    """
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    rows_before = len(df)

    # Vectorize the raw text with the pretrained vectorizer, then classify.
    features = _vectorizer.transform(df["text"].astype(str))
    # 1 = sideeffect, 0 = no_sideeffect
    df["sideeffect_pred"] = _model.predict(features)

    is_sideeffect = df["sideeffect_pred"] == 1
    filtered_df = df[is_sideeffect].reset_index(drop=True)
    rows_after = len(filtered_df)

    print(f"Initial CSV had {rows_before} rows; after filtering side effects, {rows_after} rows remain.")

    return filtered_df
|
|
46
|
+
|
|
47
|
+
# -----------------------------
|
|
48
|
+
# Text cleaning
|
|
49
|
+
# -----------------------------
|
|
50
|
+
def clean_text(text):
    """
    Normalize raw text for keyword matching.

    Removes URLs and @mentions, drops every character that is not an ASCII
    letter or whitespace, collapses whitespace runs, strips the ends and
    lower-cases the result.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lower-cased string; "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # URLs are recognised by an "http(s)://" scheme or a "www." prefix at a
    # word start, so ordinary words that merely contain those letters
    # (e.g. "awwww") are no longer deleted.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^A-Za-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text.lower()
|
|
64
|
+
|
|
65
|
+
# -----------------------------
|
|
66
|
+
# Symptom extraction
|
|
67
|
+
# -----------------------------
|
|
68
|
+
# Default lexicon (known + uncommon terms), built once instead of on every call.
_DEFAULT_SYMPTOM_LEXICON = (
    "spike protein", "diabetes", "vascular", "autoimmune", "p53",
    "t cell", "vitamin d", "contaminated", "zinc", "circumin",
    "ivermectin", "cancer", "myocarditis", "hospital", "outpatient",
    "inpatient", "infection", "bacteria", "fungal", "viral", "sepsis",
    "respiratory", "gynaecology", "dermatology", "ophthalmology",
    "otology", "dental", "hypoxia", "doxycycline", "nanosolver",
    "anticoagulation", "aspirin", "stroke", "heart attack",
    "coronary artery disease", "arrhythmia", "autism",
    "neurodegenerative", "alzheimer", "cognitive",
    "vascular dementia", "parkinson", "immune", "pots", "mcas",
    "insomnia", "new onset dyslipidaemia", "hypertension",
    "cardio-metabolic disturbance", "nervous system",
    "mast cell activity in skin", "post-covid-19 vaccine syndrome",
    "seizure disorders", "migraines", "neuropathy",
    "inflammatory bowel disease", "depression", "anxiety disorders",
    "chronic fatigue syndrome", "lyme disease", "fibromyalgia",
    "arthritis", "chronic obstructive pulmonary disease", "copd",
    "asthma", "chronic kidney disease", "ckd", "chronic heart failure",
    "chf", "bleeding disorders", "atherosclerosis", "vasculopathies",
    "endothelialitis", "thrombosis", "coagulopathy", "long covid",
    "thrombocytopenia", "low platelet", "internal bleeding",
    "lymphopenia", "neutropenia", "suppressed immune", "immune dysfunction",
    "muscle pain", "joint pain", "vomiting", "fever", "autoimmunity",
    "sleep apnea", "guillian barre syndrome", "adem", "cvst",
    "spike amyloids hamper fibrinolysis", "sticky blood", "neuropsychiatric",
    "mrna", "psychosis", "dementia", "schizophrenia", "suicidal",
    "homicidal", "brain clot", "violent behavior", "cognitive decline",
    "delusion", "takotsubo cardiomyopathy", "lipid nanoparticle toxicity",
    "allergenic", "cytotoxic", "pneumonia", "endocrine",
    "immune microclot", "vascular dysfunction", "teamclot",
    "organ impairment", "endothelian diagnostic", "thromboembolic events",
    "inflammatory cytokine increase", "allergic reactions", "igg increase",
    "iga increase",
)


def extract_symptoms(text, lexicon=None):
    """
    Return the lexicon terms that occur in ``text`` (case-insensitive).

    A term matches only where it begins at a word start, so short entries
    such as "pots" or "adem" are not reported for "spots" or "academy".
    Trailing characters are still allowed, so "migraine" matches "migraines".

    Args:
        text: Text to scan.
        lexicon: Iterable of lower-case terms to look for. When None, the
            default known + uncommon symptom lexicon is used.

    Returns:
        List of matched terms, in lexicon order; [] when ``text`` is not a
        string.
    """
    if lexicon is None:
        lexicon = _DEFAULT_SYMPTOM_LEXICON

    if not isinstance(text, str):
        return []

    lowered = text.lower()  # lower-case once, not once per term
    return [
        term
        for term in lexicon
        # (?<!\w): the term must not be glued to a preceding word character.
        if re.search(r"(?<!\w)" + re.escape(term), lowered)
    ]
|
|
113
|
+
|
|
114
|
+
# -----------------------------
|
|
115
|
+
# Slang normalization
|
|
116
|
+
# -----------------------------
|
|
117
|
+
# Maps informal phrases to the standard symptom names they stand for.
slang_lexicon = {
    "feel like shit": ["fatigue", "malaise"],
    "exhausted": ["fatigue"],
    "shield against the storm": ["immune response", "general malaise"],
    "i have been run over by a truck": ["muscle pain", "joint pain", "fatigue"],
    "tired": ["fatigue"],
    "knackered": ["fatigue"],
    "wiped out": ["fatigue"],
    "brain fog": ["cognitive"],
    "sleepy all day": ["insomnia", "fatigue"],
    "my head is pounding": ["headache"],
    "can't sleep": ["insomnia"],
    "heart racing": ["arrhythmia"],
    "out of breath": ["respiratory distress", "fatigue"]
}


def normalize_slang(text, slang_lexicon=slang_lexicon):
    """
    Translate slang phrases found in ``text`` into standard symptom names.

    Args:
        text: Text to scan (matched case-insensitively, by substring).
        slang_lexicon: Mapping of lower-case slang phrase -> list of standard
            symptoms. Defaults to the module-level ``slang_lexicon``.

    Returns:
        De-duplicated list of symptoms, ordered by first occurrence while
        walking the lexicon; [] when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    matched_symptoms = []

    for slang, symptoms in slang_lexicon.items():
        if slang in text_lower:
            matched_symptoms.extend(symptoms)

    # dict.fromkeys de-duplicates while keeping insertion order; list(set(...))
    # returned the symptoms in a different order from run to run.
    return list(dict.fromkeys(matched_symptoms))
|
|
149
|
+
|
|
150
|
+
# -----------------------------
|
|
151
|
+
# Extract known/uncommon symptoms
|
|
152
|
+
# -----------------------------
|
|
153
|
+
def extract_known_symptoms(text):
    """
    Return the known (common) symptom keywords that occur in ``text``.

    Matching is case-insensitive. A keyword matches only where it begins at a
    word start (so it is not reported from the middle of another word), but
    trailing characters are allowed, e.g. "migraine" matches "migraines".

    Args:
        text: Text to scan.

    Returns:
        List of matched keywords, in the order of the keyword list; [] when
        ``text`` is not a string.
    """
    known_symptoms_keywords = [
        "fever", "fatigue", "headache", "muscle pain", "joint pain",
        "vomiting", "insomnia", "cognitive", "anxiety disorders",
        "depression", "respiratory", "asthma", "chronic fatigue syndrome",
        "migraine", "neuropathy", "sleep apnea"
    ]
    if not isinstance(text, str):
        return []

    lowered = text.lower()  # lower-case once, not once per keyword
    return [
        keyword
        for keyword in known_symptoms_keywords
        # (?<!\w): the keyword must not be glued to a preceding word character.
        if re.search(r"(?<!\w)" + re.escape(keyword), lowered)
    ]
|
|
163
|
+
|
|
164
|
+
def extract_uncommon_symptoms(text):
    """
    Return the uncommon symptom keywords that occur in ``text``.

    Matching is case-insensitive. A keyword matches only where it begins at a
    word start, so "stroke" is not reported for "backstroke" and "adem" is not
    reported for "academy"; trailing characters are still allowed.

    Args:
        text: Text to scan.

    Returns:
        List of matched keywords, in the order of the keyword list; [] when
        ``text`` is not a string.
    """
    uncommon_symptoms_keywords = [
        "myocarditis", "stroke", "heart attack", "coronary artery disease",
        "arrhythmia", "thrombosis", "coagulopathy", "thrombocytopenia",
        "low platelet", "internal bleeding", "lymphopenia", "neutropenia",
        "guillian barre syndrome", "adem", "cvst", "takotsubo cardiomyopathy",
        "lipid nanoparticle toxicity", "brain clot", "psychosis", "schizophrenia",
        "suicidal", "homicidal", "autoimmunity", "vascular dysfunction",
        "immune dysfunction", "organ impairment", "spike amyloids hamper fibrinolysis",
        "sticky blood", "neuropsychiatric", "post-covid-19 vaccine syndrome", "long covid"
    ]
    if not isinstance(text, str):
        return []

    lowered = text.lower()  # lower-case once, not once per keyword
    return [
        keyword
        for keyword in uncommon_symptoms_keywords
        # (?<!\w): the keyword must not be glued to a preceding word character.
        if re.search(r"(?<!\w)" + re.escape(keyword), lowered)
    ]
|
|
178
|
+
|
|
179
|
+
# -----------------------------
|
|
180
|
+
# Load and process CSV
|
|
181
|
+
# -----------------------------
|
|
182
|
+
def load_and_process_data(input_file_path, text_column="text"):
    """
    Load a CSV file and add cleaned-text, symptom and alert columns.

    Unlike ``main``, no side-effect classification/filtering is applied.

    Added columns: 'cleaned_text', 'extracted_symptoms',
    'extracted_uncommon_symptoms', 'known_symptoms_flag',
    'uncommon_symptoms_flag' and 'alert_keywords_flag' (flags are 0/1 ints).

    Args:
        input_file_path: Path of the CSV file to read.
        text_column: Name of the column holding the raw text.

    Returns:
        The loaded DataFrame with the derived columns added.

    Raises:
        ValueError: If ``text_column`` is not a column of the file.
    """
    df = pd.read_csv(input_file_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in input file.")

    # clean_text already maps non-string cells (e.g. NaN from empty CSV
    # fields) to ""; casting to str first turned them into the word "nan".
    df["cleaned_text"] = df[text_column].apply(clean_text)
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_known_symptoms)
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)
    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda x: int(len(x) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda x: int(len(x) > 0))

    # Flag rows whose cleaned text contains any alert keyword (substring match).
    alert_keywords = ["urgent", "emergency", "severe", "critical", "immediate"]
    df["alert_keywords_flag"] = df["cleaned_text"].apply(lambda x: int(any(k in x for k in alert_keywords)))

    return df
|
|
198
|
+
|
|
199
|
+
# -----------------------------
|
|
200
|
+
# Main pipeline
|
|
201
|
+
# -----------------------------
|
|
202
|
+
def main(csv_path, text_column="text"):
    """
    Load a CSV, keep only rows classified as side effects, and annotate them.

    Steps: classify/filter with ``classify_sideeffects``, clean the text,
    extract symptoms, and flag alert keywords.

    Added columns: 'cleaned_text', 'extracted_symptoms' (full default
    lexicon), 'extracted_uncommon_symptoms' (uncommon keywords only),
    'known_symptoms_flag', 'uncommon_symptoms_flag' and
    'alert_keywords_flag' (flags are 0/1 ints).

    Args:
        csv_path: Path of the CSV file to read.
        text_column: Name of the column holding the raw text.
            NOTE(review): ``classify_sideeffects`` checks for a column named
            'text', so a different ``text_column`` appears to fail at that
            step -- confirm before relying on this parameter.

    Returns:
        The filtered and annotated DataFrame.

    Raises:
        ValueError: If ``text_column`` is not a column of the file.
    """
    df = pd.read_csv(csv_path)

    if text_column not in df.columns:
        raise ValueError(f"CSV must contain a '{text_column}' column")

    # Step 1: Filter side effects
    df = classify_sideeffects(df)

    # Step 2: Clean text. clean_text maps non-string cells (e.g. NaN) to "";
    # casting to str first turned them into the literal word "nan".
    df["cleaned_text"] = df[text_column].apply(clean_text)

    # Step 3: Extract symptoms (full default lexicon)
    df["extracted_symptoms"] = df["cleaned_text"].apply(extract_symptoms)
    # The uncommon column previously reused the full lexicon, which made it
    # (and its flag) an exact copy of 'extracted_symptoms'.
    df["extracted_uncommon_symptoms"] = df["cleaned_text"].apply(extract_uncommon_symptoms)

    df["known_symptoms_flag"] = df["extracted_symptoms"].apply(lambda x: int(len(x) > 0))
    df["uncommon_symptoms_flag"] = df["extracted_uncommon_symptoms"].apply(lambda x: int(len(x) > 0))

    # Step 4: Detect alert keywords (substring match on the cleaned text)
    alert_keywords = ["urgent", "emergency", "severe", "critical", "immediate"]
    df["alert_keywords_flag"] = df["cleaned_text"].apply(lambda x: int(any(k in x for k in alert_keywords)))

    return df
|
|
226
|
+
|
|
227
|
+
# -----------------------------
|
|
228
|
+
# Onset time processing
|
|
229
|
+
# -----------------------------
|
|
230
|
+
def extract_onset_time(text):
    """
    Extract symptom onset delays, in hours, from free text.

    Recognises phrases of the form
    "<quantity> hour(s)|day(s) after|since|post|following ...", for example
    "2 days after vaccination", "3 hours post-dose" or "a day after shot".
    The quantity is a number or one of the words a, few, couple, several,
    many, dozen, half. Day quantities are converted to hours because
    ``calculate_duration`` adds each value as a number of hours.

    Args:
        text: Text to scan (case-insensitive).

    Returns:
        List with one delay (int or float, in hours) per phrase found, in
        order of appearance; [] when ``text`` is not a string or nothing
        matches.
    """
    if not isinstance(text, str):
        return []

    # Numeric value of each quantity word the pattern can capture.
    quantity_words = {
        "a": 1, "few": 2, "couple": 2, "several": 3, "many": 5,
        "dozen": 12, "half": 0.5,
    }
    # One pattern covers every trailer ("after vaccination", "post-dose",
    # "post jab", ...) because each starts with after/since/post/following;
    # this avoids counting the same phrase twice. The leading \b stops "a"
    # from matching inside another word (e.g. "extra hours after").
    pattern = (
        r"\b(\d+|a|few|couple|several|many|dozen|half)"
        r"\s*(hours?|days?)"
        r"\s*(?:after|since|post|following)"
    )

    onset_times = []
    for match in re.finditer(pattern, text, re.IGNORECASE):
        token = match.group(1).lower()
        amount = quantity_words[token] if token in quantity_words else int(token)
        if match.group(2).lower().startswith("day"):
            amount *= 24  # normalise days to hours
        onset_times.append(amount)
    return onset_times
|
|
257
|
+
|
|
258
|
+
def calculate_duration(timestamp, onset_times):
    """
    Offset ``timestamp`` by each onset delay.

    Despite the name, the result holds points in time
    (``timestamp + onset hours``), not durations.

    Args:
        timestamp: Reference time (e.g. a ``pd.Timestamp``). Any missing value
            (NaT, None, NaN) yields an empty result.
        onset_times: List of delays in hours; entries that are not int/float
            are skipped. Anything other than a list yields an empty result.

    Returns:
        List of ``timestamp + pd.Timedelta(hours=onset)``, one per usable
        entry of ``onset_times``.
    """
    # pd.isna also catches None and NaN; the identity test against pd.NaT let
    # those through and then failed on the addition below.
    if pd.isna(timestamp) or not isinstance(onset_times, list):
        return []

    return [
        timestamp + pd.Timedelta(hours=onset)
        for onset in onset_times
        if isinstance(onset, (int, float))
    ]
|
|
266
|
+
|
|
267
|
+
def process_onset_data(df):
    """
    Prepare a DataFrame for onset/duration plotting.

    Works on a copy of ``df``: parses 'Date' to datetimes, drops rows whose
    date cannot be parsed, and adds the columns 'timestamp' (the parsed
    date), 'onset_time' (``extract_onset_time`` of 'text') and
    'symptom_duration' (``calculate_duration`` of the two).

    Args:
        df: DataFrame with 'Date' and 'text' columns.

    Returns:
        A new DataFrame with the three columns added; the input is not
        modified.

    Raises:
        ValueError: If 'Date' or 'text' is missing.
    """
    if 'Date' not in df.columns:
        raise ValueError("CSV must contain a 'Date' column")
    if 'text' not in df.columns:
        raise ValueError("DataFrame must contain 'text' column")

    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    # Copy again after dropping rows so the assignments below always write to
    # an independent frame.
    df = df.dropna(subset=['Date']).copy()
    df['timestamp'] = df['Date']
    df['onset_time'] = df['text'].apply(extract_onset_time)
    # Built as a plain list rather than df.apply(axis=1): on an empty frame
    # the row-wise apply returns a DataFrame and the column assignment fails.
    df['symptom_duration'] = [
        calculate_duration(timestamp, onsets)
        for timestamp, onsets in zip(df['timestamp'], df['onset_time'])
    ]
    return df
|