noshot 4.0.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. noshot/data/ML TS XAI/XAI/Q1.ipynb +377 -0
  2. noshot/data/ML TS XAI/XAI/Q2.ipynb +362 -0
  3. noshot/data/ML TS XAI/XAI/Q3.ipynb +637 -0
  4. noshot/data/ML TS XAI/XAI/Q4.ipynb +206 -0
  5. noshot/data/ML TS XAI/XAI/Q5.ipynb +1018 -0
  6. {noshot-4.0.0.dist-info → noshot-6.0.0.dist-info}/METADATA +1 -1
  7. noshot-6.0.0.dist-info/RECORD +14 -0
  8. noshot/data/ML TS XAI/ML/ML Lab CIA 2 (I Found Only This Check)/Copy_of_Pistachio_csv.ipynb +0 -269
  9. noshot/data/ML TS XAI/ML/ML Lab CIA 2 (I Found Only This Check)/weatherAUS.ipynb +0 -155
  10. noshot/data/ML TS XAI/ML/Main/1. EDA-PCA (Balance Scale Dataset).ipynb +0 -139
  11. noshot/data/ML TS XAI/ML/Main/1. EDA-PCA (Rice Dataset).ipynb +0 -181
  12. noshot/data/ML TS XAI/ML/Main/10. HMM Veterbi.ipynb +0 -228
  13. noshot/data/ML TS XAI/ML/Main/2. KNN (Balance Scale Dataset).ipynb +0 -117
  14. noshot/data/ML TS XAI/ML/Main/2. KNN (Iris Dataset).ipynb +0 -165
  15. noshot/data/ML TS XAI/ML/Main/2. KNN (Sobar-72 Dataset).ipynb +0 -251
  16. noshot/data/ML TS XAI/ML/Main/3. LDA (Balance Scale Dataset).ipynb +0 -78
  17. noshot/data/ML TS XAI/ML/Main/3. LDA (NPHA Doctor Visits Dataset).ipynb +0 -114
  18. noshot/data/ML TS XAI/ML/Main/4. Linear Regression (Machine Dataset).ipynb +0 -115
  19. noshot/data/ML TS XAI/ML/Main/4. Linear Regression (Real Estate Dataset).ipynb +0 -159
  20. noshot/data/ML TS XAI/ML/Main/5. Logistic Regression (Magic04 Dataset).ipynb +0 -200
  21. noshot/data/ML TS XAI/ML/Main/5. Logistic Regression (Wine Dataset).ipynb +0 -112
  22. noshot/data/ML TS XAI/ML/Main/6. Naive Bayes Classifier (Agaricus Lepiota Dataset).ipynb +0 -153
  23. noshot/data/ML TS XAI/ML/Main/6. Naive Bayes Classifier (Wine Dataset).ipynb +0 -89
  24. noshot/data/ML TS XAI/ML/Main/7. SVM (Rice Dataset).ipynb +0 -208
  25. noshot/data/ML TS XAI/ML/Main/8. FeedForward NN (Sobar72 Dataset).ipynb +0 -260
  26. noshot/data/ML TS XAI/ML/Main/9. CNN (Cifar10 Dataset).ipynb +0 -238
  27. noshot/data/ML TS XAI/ML/Main/data/agaricus-lepiota.data +0 -8124
  28. noshot/data/ML TS XAI/ML/Main/data/balance-scale.txt +0 -625
  29. noshot/data/ML TS XAI/ML/Main/data/doctor-visits.csv +0 -715
  30. noshot/data/ML TS XAI/ML/Main/data/iris.csv +0 -151
  31. noshot/data/ML TS XAI/ML/Main/data/machine-data.csv +0 -210
  32. noshot/data/ML TS XAI/ML/Main/data/magic04.data +0 -19020
  33. noshot/data/ML TS XAI/ML/Main/data/real-estate.xlsx +0 -0
  34. noshot/data/ML TS XAI/ML/Main/data/rice.arff +0 -3826
  35. noshot/data/ML TS XAI/ML/Main/data/sobar-72.csv +0 -73
  36. noshot/data/ML TS XAI/ML/Main/data/wine-dataset.csv +0 -179
  37. noshot/data/ML TS XAI/ML/Other Codes.ipynb +0 -158
  38. noshot/data/ML TS XAI/ML/Rolls Royce AllinOne.ipynb +0 -691
  39. noshot-4.0.0.dist-info/RECORD +0 -40
  40. {noshot-4.0.0.dist-info → noshot-6.0.0.dist-info}/WHEEL +0 -0
  41. {noshot-4.0.0.dist-info → noshot-6.0.0.dist-info}/licenses/LICENSE.txt +0 -0
  42. {noshot-4.0.0.dist-info → noshot-6.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,362 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "raw",
5
+ "id": "29299513-721d-4214-9a2e-897ded70a9f6",
6
+ "metadata": {},
7
+ "source": [
8
+ "1.\tPerform a minimum of ten exploratory data analyses on the following text data (use the following code to download the text data):\n",
9
+ "from sklearn.datasets import fetch_20newsgroups\n",
10
+ "data = fetch_20newsgroups(subset='train')\n",
11
+ "print(data.data[0]) # first news article\n",
12
+ "\n",
13
+ "2.\tPerform a LIME-based explanation for a text classification model using the LIME Text Explainer. What insights can you draw from the explanations?\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "id": "38fcd124-e405-40a0-a98c-7c3a2937bdc4",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "import matplotlib.pyplot as plt\n",
24
+ "import seaborn as sns\n",
25
+ "import pandas as pd"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "id": "c68cc4f4",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "df=pd.read_csv('news.csv')\n",
36
+ "df.head()"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "id": "c512dc22-82f3-47d9-ab9e-207d39389922",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "# 1. Category distribution – Bar Plot\n",
47
+ "target_names = sorted(df['target'].dropna().unique())  # one explicit order shared by bars and tick labels\n",
48
+ "plt.figure(figsize=(12, 6))\n",
49
+ "sns.countplot(x=df['target'], order=target_names)  # one bar per category, drawn in target_names order\n",
50
+ "plt.title(\"Documents per Category\")\n",
51
+ "plt.xticks(rotation=90)  # countplot already labels each tick with its category value\n",
52
+ "plt.tight_layout()\n",
53
+ "plt.show()\n"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "id": "1a51ec41-e1cd-4a0a-a196-147e796a8e33",
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "# 2. Document length distribution – Histogram\n",
64
+ "doc_lengths = [len(doc.split()) for doc in df['document']] # compute word count per document\n",
65
+ "sns.histplot(doc_lengths, bins=50)\n",
66
+ "plt.title(\"Document Length Distribution\")\n",
67
+ "plt.xlabel(\"Words per document\")\n",
68
+ "plt.ylabel(\"Frequency\")\n",
69
+ "plt.show()"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "31477220-42cb-4b09-b823-5b1a15b85340",
76
+ "metadata": {
77
+ "scrolled": true
78
+ },
79
+ "outputs": [],
80
+ "source": [
81
+ "# 3. Average document length per category – Horizontal Bar Plot\n",
82
+ "df = df.rename(columns={'document':'text','target':'category'})  # rename returns a new frame; rebind so 'text'/'category' exist below\n",
83
+ "df['doc_len'] = df['text'].apply(lambda x: len(x.split()))  # word count per document\n",
84
+ "avg_len = df.groupby('category')['doc_len'].mean().sort_values()\n",
85
+ "plt.figure(figsize=(12, 6))\n",
86
+ "avg_len.plot(kind='barh')\n",
87
+ "plt.title(\"Average Document Length per Category\")\n",
88
+ "plt.xlabel(\"Average Word Count\")\n",
89
+ "plt.show()"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "57a3d581-a786-451e-ae59-1791b6dbc892",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "# 4. Shortest and longest documents – Text output\n",
100
+ "shortest_doc = min(df['text'], key=lambda x: len(x.split()))\n",
101
+ "longest_doc = max(df['text'], key=lambda x: len(x.split()))\n",
102
+ "print(\"\\nShortest Document:\\n\", shortest_doc[:300], \"...\")\n",
103
+ "print(\"\\nLongest Document:\\n\", longest_doc[:300], \"...\")\n"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "cbeeeb8f-1b4a-434c-9090-a768c64cacc5",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "# 5. Top 10 longest documents per category – Box Plot\n",
114
+ "top_docs = df.groupby('category')['doc_len'].nlargest(10).reset_index()\n",
115
+ "plt.figure(figsize=(12, 6))\n",
116
+ "sns.boxplot(x='category', y='doc_len', data=top_docs)\n",
117
+ "plt.xticks(rotation=90)\n",
118
+ "plt.title(\"Top 10 Longest Documents per Category\")\n",
119
+ "plt.ylabel(\"Word Count\")\n",
120
+ "plt.show()\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "83b4114a-2a97-46da-901f-af73d947f672",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "# 6. Median document length per category – Bar Plot\n",
131
+ "median_len = df.groupby('category')['doc_len'].median().sort_values()\n",
132
+ "plt.figure(figsize=(12, 6))\n",
133
+ "median_len.plot(kind='barh')\n",
134
+ "plt.title(\"Median Document Length per Category\")\n",
135
+ "plt.xlabel(\"Median Word Count\")\n",
136
+ "plt.show()"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "id": "15f7c924-46b9-4ced-81f9-a1595ebe5075",
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "# 7. Boxplot of document lengths per category – Box Plot\n",
147
+ "plt.figure(figsize=(14, 6))\n",
148
+ "sns.boxplot(x='category', y='doc_len', data=df)\n",
149
+ "plt.xticks(rotation=90)\n",
150
+ "plt.title(\"Document Length Distribution by Category\")\n",
151
+ "plt.ylabel(\"Word Count\")\n",
152
+ "plt.tight_layout()\n",
153
+ "plt.show()"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "id": "4bfa4a18-e8a9-4e08-9efc-870da7aa1f81",
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "# 8. Number of empty or very short docs – Text output\n",
164
+ "short_docs = df[df['doc_len'] < 5]\n",
165
+ "print(f\"\\nNumber of documents with less than 5 words: {len(short_docs)}\")"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "2ad5a6b9-9589-4b45-a4a7-29a330d82daf",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "# 9. Bar chart of total characters per category – Bar Plot\n",
176
+ "df['char_len'] = df['text'].apply(len)\n",
177
+ "total_chars = df.groupby('category')['char_len'].sum().sort_values()\n",
178
+ "plt.figure(figsize=(12, 6))\n",
179
+ "total_chars.plot(kind='barh')\n",
180
+ "plt.title(\"Total Characters per Category\")\n",
181
+ "plt.xlabel(\"Total Characters\")\n",
182
+ "plt.show()\n"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "id": "ea3801ab-d444-4ffe-a8b6-f119f3928e3c",
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "# 10. Distribution of word lengths – Histogram\n",
193
+ "# Read from the loaded DataFrame: no fetch_20newsgroups 'data' object is created in this notebook.\n",
194
+ "text_col = 'text' if 'text' in df.columns else 'document'  # column name after / before the rename in step 3\n",
195
+ "word_lengths = [len(word) for doc in df[text_col] for word in doc.split()]\n",
196
+ "\n",
197
+ "plt.figure(figsize=(8, 5))\n",
198
+ "sns.histplot(word_lengths, bins=30)\n",
199
+ "plt.title(\"Distribution of Word Lengths\")\n",
200
+ "plt.xlabel(\"Word Length\")\n",
201
+ "plt.ylabel(\"Frequency\")\n",
202
+ "plt.show()"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "id": "b225dae4-c541-478b-9d53-e4150fb3820a",
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "import numpy as np\n",
213
+ "import lime\n",
214
+ "import lime.lime_text\n",
215
+ "from sklearn.pipeline import make_pipeline\n",
216
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
217
+ "from sklearn.linear_model import LogisticRegression\n",
218
+ "from lime.lime_text import LimeTextExplainer\n",
219
+ "import matplotlib.pyplot as plt"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": null,
225
+ "id": "0fcd1486-ca85-4b45-951a-59992e0a6be7",
226
+ "metadata": {},
227
+ "outputs": [],
228
+ "source": [
229
+ "df= pd.read_csv(\"questions.csv\")\n",
230
+ "\n",
231
+ "texts= df[\"question1\"][:400]\n",
232
+ "labels=df[\"is_duplicate\"][:400]"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "id": "d4b4f174-8ea7-4536-91b5-29501b4ea88b",
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "vectorizer = TfidfVectorizer()\n",
243
+ "X = vectorizer.fit_transform(texts)\n",
244
+ "classifier = LogisticRegression()\n",
245
+ "classifier.fit(X, labels)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "id": "e6fde476-0113-465d-876d-95cde0ed35b3",
252
+ "metadata": {},
253
+ "outputs": [],
254
+ "source": [
255
+ "pipeline = make_pipeline(vectorizer, classifier)\n",
256
+ "\n",
257
+ "# LIME explainer; class_names follow the classifier's class order (is_duplicate 0, then 1)\n",
258
+ "explainer = LimeTextExplainer(class_names=[\"Not Duplicate\", \"Duplicate\"])\n",
259
+ "\n",
260
+ "def explain_text(text):\n",
261
+ "    \"\"\"Explain the pipeline's prediction for `text`; show it, save it as HTML, and return the explanation.\"\"\"\n",
262
+ "    exp = explainer.explain_instance(\n",
263
+ "        text, pipeline.predict_proba, num_features=5\n",
264
+ "    )\n",
265
+ "    exp.show_in_notebook(text=True)\n",
266
+ "    exp.save_to_file('lime_explanation.html')\n",
267
+ "    exp.as_pyplot_figure()  # bar chart of the top feature weights\n",
268
+ "    plt.show()\n",
269
+ "\n",
270
+ "    return exp\n",
271
+ "\n",
272
+ "# Test explanation on an in-domain input: the model was trained on questions, not reviews\n",
273
+ "sample_text = \"How can I improve my coding skills?\"\n",
274
+ "explanation = explain_text(sample_text)"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "0af8afbc",
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": [
284
+ "import pandas as pd\n",
285
+ "import numpy as np\n",
286
+ "import lime\n",
287
+ "import lime.lime_text\n",
288
+ "from sklearn.pipeline import make_pipeline\n",
289
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
290
+ "from sklearn.linear_model import LogisticRegression\n",
291
+ "from lime.lime_text import LimeTextExplainer\n",
292
+ "import matplotlib.pyplot as plt\n",
293
+ "\n",
294
+ "# Load the dataset\n",
295
+ "df = pd.read_csv(\"train.csv\")\n",
296
+ "\n",
297
+ "# Preprocess the data\n",
298
+ "texts = df[\"question1\"].fillna('') + \" \" + df[\"question2\"].fillna('')\n",
299
+ "labels = df[\"is_duplicate\"]\n",
300
+ "\n",
301
+ "# Vectorize the text data\n",
302
+ "vectorizer = TfidfVectorizer()\n",
303
+ "X = vectorizer.fit_transform(texts)\n",
304
+ "\n",
305
+ "# Train a classifier\n",
306
+ "classifier = LogisticRegression(max_iter=100)\n",
307
+ "classifier.fit(X, labels)\n",
308
+ "\n",
309
+ "# Create a pipeline\n",
310
+ "pipeline = make_pipeline(vectorizer, classifier)\n",
311
+ "\n",
312
+ "# Initialize LIME Explainer\n",
313
+ "explainer = LimeTextExplainer(class_names=[\"Not Duplicate\", \"Duplicate\"])\n",
314
+ "\n",
315
+ "def explain_text(text):\n",
316
+ " exp = explainer.explain_instance(\n",
317
+ " text, pipeline.predict_proba, num_features=5\n",
318
+ " )\n",
319
+ " exp.show_in_notebook(text=True)\n",
320
+ " exp.save_to_file('lime_explanation.html')\n",
321
+ "\n",
322
+ " fig = exp.as_pyplot_figure()\n",
323
+ " plt.show()\n",
324
+ "\n",
325
+ " return exp\n",
326
+ "\n",
327
+ "# Test explanation\n",
328
+ "sample_text = \"How can I improve my coding skills?\" # Replace with any question pair\n",
329
+ "explanation = explain_text(sample_text)\n"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": null,
335
+ "id": "cf552f12",
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": []
339
+ }
340
+ ],
341
+ "metadata": {
342
+ "kernelspec": {
343
+ "display_name": "Python 3 (ipykernel)",
344
+ "language": "python",
345
+ "name": "python3"
346
+ },
347
+ "language_info": {
348
+ "codemirror_mode": {
349
+ "name": "ipython",
350
+ "version": 3
351
+ },
352
+ "file_extension": ".py",
353
+ "mimetype": "text/x-python",
354
+ "name": "python",
355
+ "nbconvert_exporter": "python",
356
+ "pygments_lexer": "ipython3",
357
+ "version": "3.12.4"
358
+ }
359
+ },
360
+ "nbformat": 4,
361
+ "nbformat_minor": 5
362
+ }