PyPI - noshot - Versions diffs - 4.0.0__py3-none-any.whl → 6.0.0__py3-none-any.whl - Mend

noshot 4.0.0py3-none-any.whl → 6.0.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

noshot/data/ML TS XAI/XAI/Q2.ipynb ADDED Viewed

@@ -0,0 +1,362 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "id": "29299513-721d-4214-9a2e-897ded70a9f6",
+   "metadata": {},
+   "source": [
+    "1.\tPerform a minimum of ten exploratory data analyses on the following text data (use the following code to download the text data):\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "data = fetch_20newsgroups(subset='train')\n",
+    "print(data.data[0])  # first news article\n",
+    "\n",
+    "2.\tPerform a LIME-based explanation for a text classification model using the LIME Text Explainer. What insights can you draw from the explanations?\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38fcd124-e405-40a0-a98c-7c3a2937bdc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c68cc4f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the news articles into a DataFrame and preview the first rows.\n",
+    "df = pd.read_csv('news.csv')\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c512dc22-82f3-47d9-ab9e-207d39389922",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. Category distribution – Bar Plot\n",
+    "# Pin one explicit, sorted category order and draw the bars in that order, so each\n",
+    "# tick label is guaranteed to belong to its bar. Labelling ticks from unique()\n",
+    "# (first-appearance order) can attach the wrong category to a bar.\n",
+    "target_names = sorted(df['target'].unique())\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "sns.countplot(x=df['target'], order=target_names)  # one bar per category\n",
+    "plt.title(\"Documents per Category\")\n",
+    "plt.xticks(rotation=90)  # rotate the category labels so they do not overlap\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a51ec41-e1cd-4a0a-a196-147e796a8e33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. Document length distribution – Histogram\n",
+    "# 2. Document length distribution – Histogram\n",
+    "# Word count of every document (tokens separated by whitespace).\n",
+    "doc_lengths = list(map(lambda doc: len(doc.split()), df['document']))\n",
+    "sns.histplot(doc_lengths, bins=50)\n",
+    "plt.xlabel(\"Words per document\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.title(\"Document Length Distribution\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31477220-42cb-4b09-b823-5b1a15b85340",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# 3. Average document length per category – Horizontal Bar Plot\n",
+    "# rename() returns a new DataFrame rather than modifying df, so the result must be\n",
+    "# assigned back; otherwise the 'text' / 'category' columns used from here on do not exist.\n",
+    "df = df.rename(columns={'document': 'text', 'target': 'category'})\n",
+    "df['doc_len'] = df['text'].apply(lambda x: len(x.split()))  # words per document\n",
+    "avg_len = df.groupby('category')['doc_len'].mean().sort_values()\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "avg_len.plot(kind='barh')\n",
+    "plt.title(\"Average Document Length per Category\")\n",
+    "plt.xlabel(\"Average Word Count\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57a3d581-a786-451e-ae59-1791b6dbc892",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4. Shortest and longest documents – Text output\n",
+    "# Rank documents by whitespace-delimited word count; only the first 300 characters are shown.\n",
+    "shortest_doc = min(df['text'], key=lambda text: len(text.split()))\n",
+    "longest_doc = max(df['text'], key=lambda text: len(text.split()))\n",
+    "for heading, doc in (\n",
+    "    (\"\\nShortest Document:\\n\", shortest_doc),\n",
+    "    (\"\\nLongest Document:\\n\", longest_doc),\n",
+    "):\n",
+    "    print(heading, doc[:300], \"...\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbeeeb8f-1b4a-434c-9090-a768c64cacc5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 5. Top 10 longest documents per category – Box Plot of the ten largest word counts\n",
+    "longest_per_category = df.groupby('category')['doc_len'].nlargest(10)\n",
+    "top_docs = longest_per_category.reset_index()  # turn the group index back into columns\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "sns.boxplot(data=top_docs, x='category', y='doc_len')\n",
+    "plt.title(\"Top 10 Longest Documents per Category\")\n",
+    "plt.ylabel(\"Word Count\")\n",
+    "plt.xticks(rotation=90)\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83b4114a-2a97-46da-901f-af73d947f672",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 6. Median document length per category – Bar Plot\n",
+    "# Medians are sorted ascending before the horizontal bars are drawn.\n",
+    "median_len = df.groupby('category')['doc_len'].median().sort_values()\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "median_len.plot.barh()\n",
+    "plt.xlabel(\"Median Word Count\")\n",
+    "plt.title(\"Median Document Length per Category\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15f7c924-46b9-4ced-81f9-a1595ebe5075",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 7. Boxplot of document lengths per category – Box Plot\n",
+    "# One box per category showing the spread of per-document word counts.\n",
+    "plt.figure(figsize=(14, 6))\n",
+    "sns.boxplot(data=df, x='category', y='doc_len')\n",
+    "plt.title(\"Document Length Distribution by Category\")\n",
+    "plt.ylabel(\"Word Count\")\n",
+    "plt.xticks(rotation=90)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bfa4a18-e8a9-4e08-9efc-870da7aa1f81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8. Number of empty or very short docs – Text output\n",
+    "# Keep only the rows whose word count is below five, then report how many there are.\n",
+    "short_docs = df.loc[df['doc_len'] < 5]\n",
+    "print(f\"\\nNumber of documents with less than 5 words: {len(short_docs)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ad5a6b9-9589-4b45-a4a7-29a330d82daf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 9. Bar chart of total characters per category – Bar Plot\n",
+    "# Character count per document, summed within each category and sorted ascending.\n",
+    "df['char_len'] = df['text'].map(len)\n",
+    "total_chars = df.groupby('category')['char_len'].sum().sort_values()\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "total_chars.plot.barh()\n",
+    "plt.xlabel(\"Total Characters\")\n",
+    "plt.title(\"Total Characters per Category\")\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea3801ab-d444-4ffe-a8b6-f119f3928e3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 10. Word length distribution – Histogram\n",
+    "# The articles live in `df` (loaded from news.csv); no `data` object exists in this\n",
+    "# notebook. Use whichever text column is present: 'text' once the columns have been\n",
+    "# renamed, otherwise the raw 'document' column.\n",
+    "text_column = 'text' if 'text' in df.columns else 'document'\n",
+    "\n",
+    "# Length in characters of every whitespace-delimited word across all documents.\n",
+    "word_lengths = [len(word) for text in df[text_column] for word in text.split()]\n",
+    "\n",
+    "plt.figure(figsize=(8, 5))\n",
+    "sns.histplot(word_lengths, bins=30)\n",
+    "plt.title(\"Distribution of Word Lengths\")\n",
+    "plt.xlabel(\"Word Length\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b225dae4-c541-478b-9d53-e4150fb3820a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import lime\n",
+    "import lime.lime_text\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from lime.lime_text import LimeTextExplainer\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fcd1486-ca85-4b45-951a-59992e0a6be7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the question pairs and keep a small leading sample so the model trains quickly.\n",
+    "df = pd.read_csv(\"questions.csv\")\n",
+    "\n",
+    "N_SAMPLES = 400  # number of leading rows used for training\n",
+    "\n",
+    "# Missing questions are read as NaN, which TfidfVectorizer rejects as a document;\n",
+    "# replace them with empty strings before vectorizing.\n",
+    "texts = df[\"question1\"].head(N_SAMPLES).fillna('')\n",
+    "labels = df[\"is_duplicate\"].head(N_SAMPLES)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4b4f174-8ea7-4536-91b5-29501b4ea88b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build TF-IDF features from the sampled questions and fit a logistic-regression\n",
+    "# classifier on them; the fitted estimator is the cell's displayed output.\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "classifier = LogisticRegression()\n",
+    "X = vectorizer.fit_transform(texts)\n",
+    "classifier.fit(X, labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6fde476-0113-465d-876d-95cde0ed35b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chain the already-fitted vectorizer and classifier so LIME can pass raw strings\n",
+    "# to predict_proba.\n",
+    "pipeline = make_pipeline(vectorizer, classifier)\n",
+    "\n",
+    "# LIME Explainer\n",
+    "# Class names must describe the `is_duplicate` labels the model was trained on\n",
+    "# (assumes 0 = not duplicate, 1 = duplicate). A fixed random_state makes the\n",
+    "# sampled perturbations, and therefore the explanation, repeatable between runs.\n",
+    "explainer = LimeTextExplainer(\n",
+    "    class_names=[\"Not Duplicate\", \"Duplicate\"], random_state=42\n",
+    ")\n",
+    "\n",
+    "def explain_text(text):\n",
+    "    \"\"\"Explain the pipeline's prediction for `text` with LIME.\n",
+    "\n",
+    "    Renders the explanation inline, writes it to 'lime_explanation.html'\n",
+    "    (overwritten on every call), shows the feature-weight bar chart, and\n",
+    "    returns the LIME explanation object. Uses the 5 most influential words.\n",
+    "    \"\"\"\n",
+    "    exp = explainer.explain_instance(\n",
+    "        text, pipeline.predict_proba, num_features=5\n",
+    "    )\n",
+    "    exp.show_in_notebook(text=True)\n",
+    "    exp.save_to_file('lime_explanation.html')\n",
+    "\n",
+    "    exp.as_pyplot_figure()  # draws the bar chart on a new matplotlib figure\n",
+    "    plt.show()\n",
+    "\n",
+    "    return exp\n",
+    "\n",
+    "# Test explanation\n",
+    "# NOTE(review): this sample is a film-review sentence, while the model was trained on\n",
+    "# questions; consider a question-like sample for a more meaningful explanation.\n",
+    "sample_text = \"I really enjoyed this film, it was fantastic!\"\n",
+    "explanation = explain_text(sample_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0af8afbc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import lime\n",
+    "import lime.lime_text\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from lime.lime_text import LimeTextExplainer\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Read the full question-pair dataset\n",
+    "df = pd.read_csv(\"train.csv\")\n",
+    "\n",
+    "# One text per row: both questions joined by a single space (missing values become '')\n",
+    "texts = df[\"question1\"].fillna('') + \" \" + df[\"question2\"].fillna('')\n",
+    "labels = df[\"is_duplicate\"]\n",
+    "\n",
+    "# TF-IDF features for the combined question text\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "X = vectorizer.fit_transform(texts)\n",
+    "\n",
+    "# Logistic-regression classifier fitted on the TF-IDF matrix\n",
+    "classifier = LogisticRegression(max_iter=100)\n",
+    "classifier.fit(X, labels)\n",
+    "\n",
+    "# Chain the fitted steps so raw strings can be scored directly\n",
+    "pipeline = make_pipeline(vectorizer, classifier)\n",
+    "\n",
+    "# LIME explainer with human-readable names for the two classes\n",
+    "explainer = LimeTextExplainer(class_names=[\"Not Duplicate\", \"Duplicate\"])\n",
+    "\n",
+    "def explain_text(text):\n",
+    "    \"\"\"Explain the pipeline's prediction for `text` with LIME.\n",
+    "\n",
+    "    Shows the explanation inline, saves it to 'lime_explanation.html',\n",
+    "    displays the feature-weight bar chart, and returns the explanation object.\n",
+    "    \"\"\"\n",
+    "    lime_exp = explainer.explain_instance(\n",
+    "        text,\n",
+    "        pipeline.predict_proba,\n",
+    "        num_features=5,\n",
+    "    )\n",
+    "    lime_exp.show_in_notebook(text=True)\n",
+    "    lime_exp.save_to_file('lime_explanation.html')\n",
+    "\n",
+    "    lime_exp.as_pyplot_figure()\n",
+    "    plt.show()\n",
+    "\n",
+    "    return lime_exp\n",
+    "\n",
+    "# Try the explainer on one sample\n",
+    "sample_text = \"How can I improve my coding skills?\"  # Replace with any question pair\n",
+    "explanation = explain_text(sample_text)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf552f12",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

noshot 4.0.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

noshot 4.0.0py3-none-any.whl → 6.0.0py3-none-any.whl