datascience-toolkitt 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datascience_toolkitt/__init__.py
@@ -0,0 +1,15 @@
+ from .analysis import (
+     run_kmeans,
+     run_apriori,
+     run_sentiment,
+     run_sna,
+     run_all
+ )
+
+ __all__ = [
+     "run_kmeans",
+     "run_apriori",
+     "run_sentiment",
+     "run_sna",
+     "run_all"
+ ]
datascience_toolkitt/analysis.py
@@ -0,0 +1,201 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import nltk
+ import re
+ from mlxtend.preprocessing import TransactionEncoder
+ from mlxtend.frequent_patterns import apriori, association_rules
+ from sklearn.cluster import KMeans
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import silhouette_score, accuracy_score, classification_report, confusion_matrix
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.naive_bayes import MultinomialNB
+ import networkx as nx
+
+
+ # -------------------------------------------------------------
+ # 1. K-MEANS CLUSTERING (Simple Version)
+ # -------------------------------------------------------------
+ def run_kmeans(dataset_path):
+     df = pd.read_csv(dataset_path)
+     X = df[['Age', 'Annual_Income_(k$)', 'Spending_Score']]
+     X_scaled = StandardScaler().fit_transform(X)
+
+     inertia, sil = [], []
+     for k in range(2, 11):
+         km = KMeans(n_clusters=k, random_state=42)
+         labels = km.fit_predict(X_scaled)
+         inertia.append(km.inertia_)
+         sil.append(silhouette_score(X_scaled, labels))
+
+     plt.plot(inertia, marker='o')
+     plt.title("Elbow Method")
+     plt.grid(True)
+     plt.show()
+
+     plt.plot(sil, marker='o')
+     plt.title("Silhouette Scores")
+     plt.grid(True)
+     plt.show()
+
+     kmeans = KMeans(n_clusters=5, random_state=42)
+     df['Cluster'] = kmeans.fit_predict(X_scaled)
+
+     print("\nCluster Profile (Mean Values):\n")
+     print(df.groupby('Cluster')[['Age', 'Annual_Income_(k$)', 'Spending_Score']].mean())
+
+     sns.scatterplot(
+         x=df['Annual_Income_(k$)'],
+         y=df['Spending_Score'],
+         hue=df['Cluster'],
+         palette='viridis',
+         s=80
+     )
+     plt.title("Customer Segments")
+     plt.grid(True)
+     plt.show()
+
+     sns.scatterplot(
+         x=X_scaled[:, 1],
+         y=X_scaled[:, 2],
+         hue=df['Cluster'],
+         palette='viridis',
+         s=80
+     )
+     plt.title("Scaled Clusters")
+     plt.xlabel("Scaled Income")
+     plt.ylabel("Scaled Score")
+     plt.grid(True)
+     plt.show()
+
+     return df
+
+
+ # -------------------------------------------------------------
+ # 2. APRIORI ANALYSIS (Simple Version)
+ # -------------------------------------------------------------
+ def run_apriori(dataset_path):
+     df = pd.read_csv(dataset_path, sep=';', on_bad_lines='skip')
+     df = df.dropna(subset=['CustomerID'])
+     df['BillNo'] = df['BillNo'].astype(str)
+     df = df[~df['BillNo'].str.contains('C')]
+     df['Itemname'] = df['Itemname'].str.strip()
+
+     transactions = df.groupby('BillNo')['Itemname'].apply(list).tolist()
+
+     te = TransactionEncoder()
+     df_enc = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)
+
+     itemsets = apriori(df_enc, min_support=0.01, use_colnames=True)
+     print("\nFrequent Itemsets:\n", itemsets)
+
+     rules = association_rules(itemsets, metric="confidence", min_threshold=0.5)
+     print("\nAssociation Rules:\n", rules)
+
+     return itemsets, rules
+
+
+ # -------------------------------------------------------------
+ # 3. SENTIMENT ANALYSIS (Simple Version)
+ # -------------------------------------------------------------
+ def run_sentiment(dataset_path):
+     df = pd.read_csv(dataset_path)
+     df = df.dropna(subset=['reviews.text', 'reviews.rating'])
+
+     df['full_review'] = df['reviews.title'].fillna('') + " " + df['reviews.text']
+
+     df['sentiment'] = df['reviews.rating'].apply(
+         lambda r: 'positive' if r >= 4 else ('neutral' if r == 3 else 'negative')
+     )
+
+     nltk.download('stopwords')
+     stop_words = set(nltk.corpus.stopwords.words('english'))
+
+     def clean_text(t):
+         t = re.sub('[^a-zA-Z]', ' ', str(t)).lower()
+         words = t.split()
+         return " ".join(w for w in words if w not in stop_words)
+
+     df['clean'] = df['full_review'].apply(clean_text)
+
+     X_train, X_test, y_train, y_test = train_test_split(
+         df['clean'],
+         df['sentiment'],
+         test_size=0.2,
+         random_state=42,
+         stratify=df['sentiment']
+     )
+
+     cv = CountVectorizer()
+     X_train_vec = cv.fit_transform(X_train)
+     X_test_vec = cv.transform(X_test)
+
+     model = MultinomialNB()
+     model.fit(X_train_vec, y_train)
+
+     y_pred = model.predict(X_test_vec)
+
+     print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
+     print("\nClassification Report:\n", classification_report(y_test, y_pred))
+
+     cm = confusion_matrix(y_test, y_pred,
+                           labels=['negative', 'neutral', 'positive'])
+
+     sns.heatmap(
+         cm, annot=True, cmap='Blues', fmt='d',
+         xticklabels=['negative', 'neutral', 'positive'],
+         yticklabels=['negative', 'neutral', 'positive']
+     )
+     plt.title("Confusion Matrix")
+     plt.show()
+
+     return model, cv
+
+
+ # -------------------------------------------------------------
+ # 4. SOCIAL NETWORK ANALYSIS (Simple Version)
+ # -------------------------------------------------------------
+ def run_sna(dataset_path):
+     df = pd.read_csv(dataset_path, sep=' ', names=['user_1', 'user_2'])
+
+     G = nx.from_pandas_edgelist(df, 'user_1', 'user_2')
+
+     degree = dict(G.degree())
+
+     k = min(200, G.number_of_nodes())
+     betweenness = nx.betweenness_centrality(G, k=k, seed=42)
+     closeness = nx.closeness_centrality(G)
+
+     print("\nTop 5 by Degree Centrality:")
+     for u, s in sorted(degree.items(), key=lambda x: x[1], reverse=True)[:5]:
+         print(f"User {u}: {s}")
+
+     print("\nTop 5 by Betweenness:")
+     for u, s in sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]:
+         print(f"User {u}: {s:.4f}")
+
+     print("\nTop 5 by Closeness:")
+     for u, s in sorted(closeness.items(), key=lambda x: x[1], reverse=True)[:5]:
+         print(f"User {u}: {s:.4f}")
+
+     return degree, betweenness, closeness
+
+
+ # -------------------------------------------------------------
+ # 5. RUN ALL MODULES
+ # -------------------------------------------------------------
+ def run_all(kmeans_file, apriori_file, sentiment_file, sna_file):
+     print("\n=== K-MEANS CLUSTERING ===")
+     run_kmeans(kmeans_file)
+
+     print("\n=== APRIORI ===")
+     run_apriori(apriori_file)
+
+     print("\n=== SENTIMENT ANALYSIS ===")
+     run_sentiment(sentiment_file)
+
+     print("\n=== SOCIAL NETWORK ANALYSIS ===")
+     run_sna(sna_file)
+
+     print("\nALL ANALYSIS COMPLETED.")
datascience_toolkitt/apriori_analysis.py
@@ -0,0 +1,25 @@
+ def run_apriori(dataset_path):
+     import pandas as pd
+     from mlxtend.preprocessing import TransactionEncoder
+     from mlxtend.frequent_patterns import apriori, association_rules
+
+     df = pd.read_csv(dataset_path, sep=';', on_bad_lines='skip')
+     df = df.dropna(subset=['CustomerID'])
+     df['BillNo'] = df['BillNo'].astype(str)
+     df = df[~df['BillNo'].str.contains('C')]
+     df['Itemname'] = df['Itemname'].str.strip()
+
+     transactions = df.groupby('BillNo')['Itemname'].apply(list).tolist()
+
+     te = TransactionEncoder()
+     df_enc = pd.DataFrame(te.fit(transactions).transform(transactions),
+                           columns=te.columns_)
+
+     itemsets = apriori(df_enc, min_support=0.01, use_colnames=True)
+     print("\nFrequent Itemsets:\n", itemsets)
+
+     rules = association_rules(itemsets, metric="confidence",
+                               min_threshold=0.5)
+     print("\nAssociation Rules:\n", rules)
+
+     return itemsets, rules
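The standalone `apriori_analysis` module both prints and returns the frequent itemsets and rules, so a caller can filter or rank them afterwards. A sketch of post-processing the returned rules; `retail.csv` is a placeholder path, and the column names used below (`support`, `confidence`, `lift`) are the ones mlxtend's `association_rules` emits:

```python
# Sketch: keep only rules with lift above 1 and rank them by confidence.
# "retail.csv" is a hypothetical semicolon-separated transactions file.
from datascience_toolkitt.apriori_analysis import run_apriori

itemsets, rules = run_apriori("retail.csv")

strong = rules[rules["lift"] > 1.0].sort_values("confidence", ascending=False)
print(strong[["antecedents", "consequents", "support", "confidence", "lift"]].head(10))
```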
datascience_toolkitt/clustering.py
@@ -0,0 +1,52 @@
+ def run_kmeans(dataset_path):
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     import seaborn as sns
+     from sklearn.cluster import KMeans
+     from sklearn.preprocessing import StandardScaler
+     from sklearn.metrics import silhouette_score
+
+     df = pd.read_csv(dataset_path)
+     X = df[['Age','Annual_Income_(k$)','Spending_Score']]
+     X_scaled = StandardScaler().fit_transform(X)
+
+     inertia, sil = [], []
+     for k in range(2, 11):
+         km = KMeans(n_clusters=k, random_state=42)
+         labels = km.fit_predict(X_scaled)
+         inertia.append(km.inertia_)
+         sil.append(silhouette_score(X_scaled, labels))
+
+     plt.figure(figsize=(10,5))
+     plt.plot(inertia, marker='o')
+     plt.title("Elbow Method")
+     plt.grid(True)
+     plt.show()
+
+     plt.figure(figsize=(10,5))
+     plt.plot(sil, marker='o')
+     plt.title("Silhouette Scores")
+     plt.grid(True)
+     plt.show()
+
+     kmeans = KMeans(n_clusters=5, random_state=42)
+     df['Cluster'] = kmeans.fit_predict(X_scaled)
+
+     print("\nCluster Profile (Mean Values):\n")
+     print(df.groupby('Cluster')[['Age','Annual_Income_(k$)','Spending_Score']].mean())
+
+     sns.scatterplot(x=df['Annual_Income_(k$)'], y=df['Spending_Score'],
+                     hue=df['Cluster'], palette='viridis', s=80)
+     plt.title("Customer Segments")
+     plt.grid(True)
+     plt.show()
+
+     sns.scatterplot(x=X_scaled[:,1], y=X_scaled[:,2],
+                     hue=df['Cluster'], palette='viridis', s=80)
+     plt.title("Scaled Clusters")
+     plt.xlabel("Scaled Income")
+     plt.ylabel("Scaled Score")
+     plt.grid(True)
+     plt.show()
+
+     return df
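`clustering.py` repeats `run_kmeans` as a standalone module, adding explicit figure sizes. Since the column names are fixed, any input file must contain exactly `Age`, `Annual_Income_(k$)` and `Spending_Score`; a sketch that writes a small synthetic CSV purely to exercise the function (all values are random and illustrative):

```python
# Sketch: fabricate a throwaway CSV with the expected column names, then
# call the standalone clustering module on it. Data is random, not real.
import numpy as np
import pandas as pd
from datascience_toolkitt.clustering import run_kmeans

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    "Age": rng.integers(18, 70, size=100),
    "Annual_Income_(k$)": rng.integers(15, 140, size=100),
    "Spending_Score": rng.integers(1, 100, size=100),
})
demo.to_csv("demo_customers.csv", index=False)

clustered = run_kmeans("demo_customers.csv")  # shows the plots, returns df with a 'Cluster' column
print(clustered.groupby("Cluster").size())
```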
datascience_toolkitt/sentiment_analysis.py
@@ -0,0 +1,57 @@
+ def run_sentiment(dataset_path):
+     import pandas as pd
+     import re
+     import nltk
+     from nltk.corpus import stopwords
+     from sklearn.model_selection import train_test_split
+     from sklearn.feature_extraction.text import CountVectorizer
+     from sklearn.naive_bayes import MultinomialNB
+     from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+     import seaborn as sns
+     import matplotlib.pyplot as plt
+
+     df = pd.read_csv(dataset_path)
+     df = df.dropna(subset=['reviews.text', 'reviews.rating'])
+
+     df['full_review'] = df['reviews.title'].fillna('') + ' ' + df['reviews.text']
+
+     df['sentiment'] = df['reviews.rating'].apply(
+         lambda r: 'positive' if r >= 4 else ('neutral' if r == 3 else 'negative')
+     )
+
+     nltk.download('stopwords')
+     stop_words = set(stopwords.words('english'))
+
+     def clean(t):
+         t = re.sub(r'[^a-z\s]', '', str(t).lower())
+         return ' '.join([w for w in t.split() if w not in stop_words])
+
+     df['clean'] = df['full_review'].apply(clean)
+
+     X_train, X_test, y_train, y_test = train_test_split(
+         df['clean'], df['sentiment'], test_size=0.2,
+         random_state=42, stratify=df['sentiment']
+     )
+
+     cv = CountVectorizer()
+     X_train_vec = cv.fit_transform(X_train)
+     X_test_vec = cv.transform(X_test)
+
+     model = MultinomialNB()
+     model.fit(X_train_vec, y_train)
+
+     y_pred = model.predict(X_test_vec)
+
+     print("Accuracy:", round(accuracy_score(y_test, y_pred)*100, 2), "%")
+     print("\nClassification Report:\n", classification_report(y_test, y_pred))
+
+     cm = confusion_matrix(y_test, y_pred,
+                           labels=['negative','neutral','positive'])
+
+     sns.heatmap(cm, annot=True, cmap='Blues', fmt='d',
+                 xticklabels=['negative','neutral','positive'],
+                 yticklabels=['negative','neutral','positive'])
+     plt.title("Confusion Matrix")
+     plt.show()
+
+     return model, cv
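`run_sentiment` returns the fitted `MultinomialNB` model together with its `CountVectorizer`, which is enough to score text the function never saw. A sketch of reusing that pair on new reviews; the dataset path and the sample sentences are placeholders:

```python
# Sketch: classify new reviews with the (model, vectorizer) pair returned above.
# "reviews.csv" is a hypothetical input file with the reviews.* columns.
from datascience_toolkitt.sentiment_analysis import run_sentiment

model, cv = run_sentiment("reviews.csv")

new_reviews = [
    "absolutely love this tablet, battery lasts for days",
    "stopped working after a week, very disappointed",
]
# Note: training text was lowercased with stopwords removed; cleaning new text
# the same way would match the training vocabulary more closely.
X_new = cv.transform(new_reviews)
print(model.predict(X_new))  # e.g. ['positive' 'negative']
```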
datascience_toolkitt/social_network.py
@@ -0,0 +1,26 @@
+ def run_sna(dataset_path):
+     import pandas as pd
+     import networkx as nx
+
+     df = pd.read_csv(dataset_path, sep=' ', names=['user_1','user_2'])
+     G = nx.from_pandas_edgelist(df, 'user_1', 'user_2')
+
+     degree = dict(G.degree())
+
+     k = min(200, G.number_of_nodes())
+     betweenness = nx.betweenness_centrality(G, k=k, seed=42)
+     closeness = nx.closeness_centrality(G)
+
+     print("\nTop 5 by Degree Centrality:")
+     for u, s in sorted(degree.items(), key=lambda x: x[1], reverse=True)[:5]:
+         print(f"User {u}: {s}")
+
+     print("\nTop 5 by Betweenness:")
+     for u, s in sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]:
+         print(f"User {u}: {s:.4f}")
+
+     print("\nTop 5 by Closeness:")
+     for u, s in sorted(closeness.items(), key=lambda x: x[1], reverse=True)[:5]:
+         print(f"User {u}: {s:.4f}")
+
+     return degree, betweenness, closeness
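`run_sna` expects a space-separated edge list (two node ids per line, no header) and returns three plain dicts keyed by node. A sketch that writes a tiny made-up edge list and tabulates the returned centralities:

```python
# Sketch: a made-up edge list in the format run_sna reads, with the three
# returned centrality dicts combined into one DataFrame for inspection.
import pandas as pd
from datascience_toolkitt.social_network import run_sna

with open("demo_edges.txt", "w") as f:
    f.write("1 2\n1 3\n2 3\n3 4\n4 5\n")

degree, betweenness, closeness = run_sna("demo_edges.txt")

summary = pd.DataFrame({
    "degree": pd.Series(degree),
    "betweenness": pd.Series(betweenness),
    "closeness": pd.Series(closeness),
}).sort_values("degree", ascending=False)
print(summary)
```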
datascience_toolkitt-1.0.7.dist-info/METADATA
@@ -0,0 +1,33 @@
+ Metadata-Version: 2.4
+ Name: datascience_toolkitt
+ Version: 1.0.7
+ Summary: All assignment programs packaged into one toolkit
+ Home-page: https://github.com/Aniudupa15/datascience_toolkit
+ Author: Anirudha
+ Author-email: your.email@example.com
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pandas
+ Requires-Dist: numpy
+ Requires-Dist: matplotlib
+ Requires-Dist: seaborn
+ Requires-Dist: scikit-learn
+ Requires-Dist: mlxtend
+ Requires-Dist: nltk
+ Requires-Dist: networkx
+ Dynamic: author
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # datascience_toolkit
+
+ Usage examples...
+
+ ![package-screenshot](/mnt/data/12985101-b4ed-41f4-9a7e-acdb56f5e45c.png)
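The metadata pins no dependency versions, and the `Requires-Dist` list covers the libraries the modules import (numpy is listed even though the code shown only reaches it through pandas and scikit-learn). Assuming the project is published on PyPI under the name in the metadata, a minimal install-and-import check:

```python
# Assuming the distribution is published under the metadata name, install with:
#     pip install datascience-toolkitt
# The top-level package then re-exports the five entry points from analysis.py.
from datascience_toolkitt import run_kmeans, run_apriori, run_sentiment, run_sna, run_all
```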
datascience_toolkitt-1.0.7.dist-info/RECORD
@@ -0,0 +1,11 @@
+ datascience_toolkitt/__init__.py,sha256=WCBcGgSRGQdBG_HxLYDmK8-LMrgdvBX74Thvgq6zvnY,205
+ datascience_toolkitt/analysis.py,sha256=0iBW9_89m6iHKj__dyN4WSIgfjA7dO8Z2APdcOCKl9o,6357
+ datascience_toolkitt/apriori_analysis.py,sha256=nErEAziky2xh2Oj9jghRr4bIA6bgpXwx_I5LMeIgSpQ,947
+ datascience_toolkitt/clustering.py,sha256=2Nyi0LZA0n4kOC5-ZBRscWC0dC-0QHpQ7q6UoRH-3iI,1606
+ datascience_toolkitt/sentiment_analysis.py,sha256=WNRjT_iPF36ZJQlHuudA7AyjAVrBwfyG_fpD6nmmtJo,1939
+ datascience_toolkitt/social_network.py,sha256=GtFhR6nNBHSRErTo0nVDJD0C0OkGzeOc23STsGuPEg8,886
+ datascience_toolkitt-1.0.7.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datascience_toolkitt-1.0.7.dist-info/METADATA,sha256=NDodgaUjRXKAaBljYpt2K2_8MkQ_XvCYiCfhFt_ssag,889
+ datascience_toolkitt-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datascience_toolkitt-1.0.7.dist-info/top_level.txt,sha256=NLR8AnmSyM3zq7NNDda5UjcBCDApUSn4R__tYLS0iR4,21
+ datascience_toolkitt-1.0.7.dist-info/RECORD,,
datascience_toolkitt-1.0.7.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
datascience_toolkitt-1.0.7.dist-info/licenses/LICENSE
File without changes (the RECORD entry shows an empty, zero-byte file)
datascience_toolkitt-1.0.7.dist-info/top_level.txt
@@ -0,0 +1 @@
+ datascience_toolkitt