PyPI - noshot - Versions diffs - 0.1.0__py3-none-any.whl - Mend

noshot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

noshot/data/AIDS CN NLP/NLP/NLP 3/word2vector.ipynb ADDED Viewed

@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "922063be-4bfd-4f18-b051-fff55cb49b29",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Text consumed by the TF-IDF cell below. This is a placeholder sample so the\n",
+    "# notebook runs from a fresh kernel; replace it with your own preprocessed text.\n",
+    "preprocessed_text = \"natural language processing turns raw text into numeric feature vectors\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4550cf0a-3517-455e-b386-717e4e030f49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Fit TF-IDF on a corpus that contains the preprocessed text as its only document.\n",
+    "tfidf_vectorizer = TfidfVectorizer()\n",
+    "X_tfidf = tfidf_vectorizer.fit_transform([preprocessed_text])\n",
+    "\n",
+    "# Learned feature names and the dense form of the fitted matrix.\n",
+    "tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()\n",
+    "tfidf_array = X_tfidf.toarray()\n",
+    "\n",
+    "print(\"TF-IDF Vocabulary:\", tfidf_vocabulary)\n",
+    "print(\"\\nTF-IDF Array:\", tfidf_array)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16a77ab0-21d3-424a-9d11-3b7fc5005acc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "from gensim.models import Word2Vec\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "# Load the dataset\n",
+    "file_path = 'text3.txt'\n",
+    "with open(file_path, 'r', encoding='utf-8') as file:\n",
+    "    text = file.read()\n",
+    "\n",
+    "# Preprocessing: convert to lowercase, remove special characters, and tokenize\n",
+    "def preprocess(text):\n",
+    "    \"\"\"Lowercase the text, keep only letters and whitespace, then tokenize it.\"\"\"\n",
+    "    text = text.lower()\n",
+    "    text = re.sub(r'[^a-zA-Z\\s]', '', text)  # Remove special characters and numbers\n",
+    "    return word_tokenize(text)\n",
+    "\n",
+    "# Tokenize and preprocess the dataset\n",
+    "tokens = preprocess(text)\n",
+    "\n",
+    "# Train Word2Vec model (the whole text is passed as a single sentence)\n",
+    "model = Word2Vec(sentences=[tokens], vector_size=100, window=5, min_count=1, sg=0)  # sg=0 for CBOW\n",
+    "\n",
+    "# Get the list of all unique words in the vocabulary\n",
+    "vocab = list(model.wv.index_to_key)\n",
+    "\n",
+    "# Pairwise cosine similarity for the whole vocabulary in one matrix product.\n",
+    "# The rows of the unit-normalised embedding matrix follow the order of `vocab`,\n",
+    "# so entry [i, j] is the similarity of vocab[i] and vocab[j]. This replaces\n",
+    "# len(vocab)**2 separate model.wv.similarity() calls in a Python loop.\n",
+    "unit_vectors = model.wv.get_normed_vectors()\n",
+    "similarity_matrix = (unit_vectors @ unit_vectors.T).astype(np.float64)\n",
+    "\n",
+    "# Convert similarity matrix into a pandas DataFrame for easy visualization\n",
+    "similarity_df = pd.DataFrame(similarity_matrix, index=vocab, columns=vocab)\n",
+    "\n",
+    "# Display the first few rows of the similarity matrix\n",
+    "print(similarity_df.head())\n",
+    "\n",
+    "# Plot heatmap of the similarity matrix\n",
+    "plt.figure(figsize=(20, 40))\n",
+    "sns.heatmap(similarity_df, cmap=\"coolwarm\", annot=False, xticklabels=True, yticklabels=True)\n",
+    "plt.title(\"Semantic Similarity Between Words\")\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6913d2e-3734-41b4-9285-6ac1afa2b1ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "from gensim.models import Word2Vec\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "# Load the dataset\n",
+    "file_path = 'story.txt'\n",
+    "with open(file_path, 'r', encoding='utf-8') as file:\n",
+    "    text = file.read()\n",
+    "\n",
+    "# Preprocessing: convert to lowercase, remove special characters, and tokenize\n",
+    "def preprocess(text):\n",
+    "    \"\"\"Lowercase the text, keep only letters and whitespace, then tokenize it.\"\"\"\n",
+    "    text = text.lower()\n",
+    "    text = re.sub(r'[^a-zA-Z\\s]', '', text)  # Remove special characters and numbers\n",
+    "    return word_tokenize(text)\n",
+    "\n",
+    "# Tokenize and preprocess the dataset\n",
+    "tokens = preprocess(text)\n",
+    "\n",
+    "# Train Word2Vec model (the whole text is passed as a single sentence)\n",
+    "model = Word2Vec(sentences=[tokens], vector_size=100, window=5, min_count=1, sg=0)  # sg=0 for CBOW\n",
+    "\n",
+    "# Get the list of all unique words in the vocabulary\n",
+    "vocab = list(model.wv.index_to_key)\n",
+    "\n",
+    "# Pairwise cosine similarity for the whole vocabulary in one matrix product.\n",
+    "# The rows of the unit-normalised embedding matrix follow the order of `vocab`,\n",
+    "# so entry [i, j] is the similarity of vocab[i] and vocab[j]. This replaces\n",
+    "# len(vocab)**2 separate model.wv.similarity() calls in a Python loop.\n",
+    "unit_vectors = model.wv.get_normed_vectors()\n",
+    "similarity_matrix = (unit_vectors @ unit_vectors.T).astype(np.float64)\n",
+    "\n",
+    "# Convert similarity matrix into a pandas DataFrame for easy visualization\n",
+    "similarity_df = pd.DataFrame(similarity_matrix, index=vocab, columns=vocab)\n",
+    "\n",
+    "# Display the first few rows of the similarity matrix\n",
+    "print(similarity_df.head())\n",
+    "\n",
+    "# Plot heatmap of the similarity matrix\n",
+    "plt.figure(figsize=(45,40))\n",
+    "sns.heatmap(similarity_df, cmap=\"coolwarm\", annot=False, xticklabels=True, yticklabels=True)\n",
+    "plt.title(\"Semantic Similarity Between Words\")\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5676923-468a-4b02-8f57-36633330ccb8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

noshot/data/AIDS CN NLP/NLP/NLP 4/Backward Procedure Algorithm.ipynb ADDED Viewed

@@ -0,0 +1,179 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "gVbwDSwfR3CY",
+        "outputId": "84d47a98-d4d4-4b51-a679-65654164125c"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+            "[nltk_data]   Package stopwords is already up-to-date!\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Package punkt is already up-to-date!\n",
+            "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
+            "[nltk_data]     /root/nltk_data...\n",
+            "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
+            "[nltk_data]       date!\n"
+          ]
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "True"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 43
+        }
+      ],
+      "source": [
+        "import nltk\n",
+        "nltk.download('stopwords')\n",
+        "nltk.download('punkt')\n",
+        "from nltk.corpus import stopwords\n",
+        "from nltk import pos_tag,word_tokenize\n",
+        "import pandas as pd\n",
+        "import math\n",
+        "nltk.download('averaged_perceptron_tagger')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def bpa(pi,state,transition,words):\n",
+        "    \"\"\"Backward procedure over a small HMM, printing a trace of every term.\n",
+        "\n",
+        "    pi         -- accepted for interface compatibility; not read by the computation.\n",
+        "    state      -- state[i][j] is the factor applied for moving from state i to state j.\n",
+        "    transition -- transition[i][k] is the factor applied for symbol k in state i.\n",
+        "    words      -- sequence of symbol indices (column indices into `transition`).\n",
+        "\n",
+        "    Returns the plain sum of beta[i][1] over all states i.\n",
+        "    NOTE(review): the sum is not weighted by `pi` -- confirm this is intended.\n",
+        "    \"\"\"\n",
+        "    words=list(words)\n",
+        "    n_states=len(state)\n",
+        "    n_words=len(words)\n",
+        "\n",
+        "    # beta[i][t] is the backward value of state i at step t (steps are 1-based).\n",
+        "    # Built locally so the function no longer depends on a pre-existing global.\n",
+        "    beta={i:{n_words+1:1} for i in range(n_states)}\n",
+        "\n",
+        "    for t in range(n_words,0,-1):\n",
+        "        ok=words[t-1]  # symbol handled at step t; does not change inside the loops below\n",
+        "        for i in range(n_states):\n",
+        "            val=0\n",
+        "            for j in range(n_states):\n",
+        "                val=val+(state[i][j]*transition[i][ok]*beta[j][t+1])\n",
+        "                print(\"k : \",ok,\"i: \",i,\"j: \",j,\"beta({},{}): \".format(j,t+1),beta[j][t+1],\"ok value: \",ok)\n",
+        "                # Trace the same beta[j][t+1] factor that enters the sum above.\n",
+        "                print(\"\\t\",state[i][j],\"*\",transition[i][ok],\"*\",beta[j][t+1])\n",
+        "            beta[i][t]=val\n",
+        "            print(\"\\t beta({}{}): \".format(i,t),val)\n",
+        "\n",
+        "    return sum(beta[i][1] for i in range(n_states))"
+      ],
+      "metadata": {
+        "id": "xIZ1yukLSCL4"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "state=[[0.7,0.3],[0.5,0.5]]\n",
+        "transition=[[0.6,0.1,0.3],[0.1,0.7,0.2]]\n",
+        "words=[2,1,0]\n",
+        "# bpa() takes an initial-distribution argument but never reads it. The notebook\n",
+        "# did not define one, so this uniform placeholder keeps the call from raising NameError.\n",
+        "pi=[0.5,0.5]\n",
+        "\n",
+        "ans=bpa(pi,state,transition,words)\n"
+      ],
+      "metadata": {
+        "id": "rr4yPQHDSYYA",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "c0fbbcc3-c7ff-4906-f082-f0152207832f"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "k :  0 i:  0 j:  0 beta(0,4):  1 ok value:  0\n",
+            "\t 0.7 * 0.6 * 1\n",
+            "k :  0 i:  0 j:  1 beta(1,4):  1 ok value:  0\n",
+            "\t 0.3 * 0.6 * 1\n",
+            "\t beta(03):  0.6\n",
+            "k :  0 i:  1 j:  0 beta(0,4):  1 ok value:  0\n",
+            "\t 0.5 * 0.1 * 1\n",
+            "k :  0 i:  1 j:  1 beta(1,4):  1 ok value:  0\n",
+            "\t 0.5 * 0.1 * 1\n",
+            "\t beta(13):  0.1\n",
+            "k :  1 i:  0 j:  0 beta(0,3):  0.6 ok value:  1\n",
+            "\t 0.7 * 0.1 * 0.6\n",
+            "k :  1 i:  0 j:  1 beta(1,3):  0.1 ok value:  1\n",
+            "\t 0.3 * 0.1 * 0.6\n",
+            "\t beta(02):  0.045\n",
+            "k :  1 i:  1 j:  0 beta(0,3):  0.6 ok value:  1\n",
+            "\t 0.5 * 0.7 * 0.1\n",
+            "k :  1 i:  1 j:  1 beta(1,3):  0.1 ok value:  1\n",
+            "\t 0.5 * 0.7 * 0.1\n",
+            "\t beta(12):  0.245\n",
+            "k :  2 i:  0 j:  0 beta(0,2):  0.045 ok value:  2\n",
+            "\t 0.7 * 0.3 * 0.045\n",
+            "k :  2 i:  0 j:  1 beta(1,2):  0.245 ok value:  2\n",
+            "\t 0.3 * 0.3 * 0.045\n",
+            "\t beta(01):  0.0315\n",
+            "k :  2 i:  1 j:  0 beta(0,2):  0.045 ok value:  2\n",
+            "\t 0.5 * 0.2 * 0.245\n",
+            "k :  2 i:  1 j:  1 beta(1,2):  0.245 ok value:  2\n",
+            "\t 0.5 * 0.2 * 0.245\n",
+            "\t beta(11):  0.029\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(\"final probability for given words: \",ans)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9rmYcYrY3yHl",
+        "outputId": "9a6c3008-27ca-4a21-e20b-ae5397827149"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "final probability for given words:  0.0605\n"
+          ]
+        }
+      ]
+    }
+  ]
+}

noshot/data/AIDS CN NLP/NLP/NLP 4/Chi Square Collocation.ipynb ADDED Viewed

@@ -0,0 +1,208 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "import nltk\n",
+        "nltk.download('stopwords')\n",
+        "nltk.download('punkt')"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "DLR4rnnIgiqo",
+        "outputId": "a27f8918-2355-4417-ef8c-908e8aa648a5"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+            "[nltk_data]   Package stopwords is already up-to-date!\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Package punkt is already up-to-date!\n"
+          ]
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "True"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 2
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.tokenize import word_tokenize\n",
+        "\n",
+        "# Read the corpus; the with-block closes the file even if reading fails.\n",
+        "with open(\"/content/sample.txt\",\"r\") as f:\n",
+        "    text=f.read()\n",
+        "text=text.lower()\n",
+        "word_tokens = word_tokenize(text)\n",
+        "print(word_tokens)\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "t8RO6ZQEgeXB",
+        "outputId": "01e1b0db-e84f-4f39-8735-393f526f47a3"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "['sastra', 'university', 'is', 'good', 'sastra', 'university', 'is', 'in', 'thanjavur', 'trichy', 'is', 'relatively', 'close', 'from', 'sastra', 'university', 'various', 'other', 'university', 'are', 'also', 'present', 'in', 'tamilnadu', 'sastra', 'offers', 'a', 'lot', 'of', 'courses', 'sastra', 'is', 'an', 'acronym', 'nit', 'is', 'also', 'a', 'college', 'near', 'trichy', ',', 'but', 'not', 'a', 'university']\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Qt6BAzwcf5PK"
+      },
+      "outputs": [],
+      "source": [
+        "def collocation(w1,w2):\n",
+        "  \"\"\"Chi-square test of whether w1 immediately followed by w2 is a collocation.\n",
+        "\n",
+        "  Reads the globals `word_tokens` (list of tokens) and `cv` (critical value).\n",
+        "  Returns [w1, w2, X] when the statistic X exceeds cv, otherwise an empty list.\n",
+        "  An empty list is also returned when any expected count is zero, because the\n",
+        "  statistic is undefined in that case.\n",
+        "  \"\"\"\n",
+        "  nl=list()\n",
+        "  N=len(word_tokens)\n",
+        "  if(N == 0):\n",
+        "    return nl\n",
+        "  pw1=word_tokens.count(w1)\n",
+        "  pw2=word_tokens.count(w2)\n",
+        "\n",
+        "  # Expected counts for the 2x2 table (w1 / not w1) x (w2 / not w2).\n",
+        "  Ew1w2= ((pw1*pw2)/N)\n",
+        "  Ew1nw2= ((pw1*(N-pw2))/N)\n",
+        "  Enw1w2= (((N-pw1)*pw2)/N)\n",
+        "  Enw1nw2= (((N-pw1)*(N-pw2)/N))\n",
+        "  if(0 in (Ew1w2,Ew1nw2,Enw1w2,Enw1nw2)):\n",
+        "    return nl\n",
+        "\n",
+        "  # Observed count of w1 directly followed by w2.\n",
+        "  pw12=sum(1 for a,b in zip(word_tokens,word_tokens[1:]) if a==w1 and b==w2)\n",
+        "\n",
+        "  Ow1w2=pw12\n",
+        "  Ow1nw2=pw1-pw12\n",
+        "  Onw1w2=pw2-pw12\n",
+        "  # Remaining cell: the four observed counts must add up to N.\n",
+        "  Onw1nw2=N-pw1-pw2+pw12\n",
+        "\n",
+        "  X= (((Ow1w2-Ew1w2)**2)/Ew1w2) + (((Ow1nw2-Ew1nw2)**2)/Ew1nw2) + (((Onw1w2-Enw1w2)**2)/Enw1w2) + (((Onw1nw2-Enw1nw2)**2)/Enw1nw2)\n",
+        "\n",
+        "  if(float(X) > float(cv)):\n",
+        "    nl.append(w1)\n",
+        "    nl.append(w2)\n",
+        "    nl.append(X)\n",
+        "  return nl"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Parsed as float so non-integer critical values (e.g. 3.841) are accepted.\n",
+        "cv=float(input(\"enter the critical value : \"))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "C-B4kq2of7pS",
+        "outputId": "d61ad4c3-7224-4325-8079-de7c337c8e37"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "enter the critical value : 10\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Score each distinct adjacent pair once, in order of first appearance.\n",
+        "# A fresh list is built because removing items from a list while iterating\n",
+        "# over it skips elements, which can leave duplicate results behind.\n",
+        "fcol=list()\n",
+        "for w1,w2 in dict.fromkeys(zip(word_tokens,word_tokens[1:])):\n",
+        "  result=collocation(w1,w2)\n",
+        "  if(len(result) > 1):\n",
+        "    fcol.append(result)\n",
+        "\n",
+        "for i in fcol:\n",
+        "  print(i)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "lBTZtPcaf-rZ",
+        "outputId": "38a7d8d6-d49c-43b9-9cdd-c0d9b506d0d9"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "['in', 'thanjavur', 22.556565656565656]\n",
+            "['thanjavur', 'trichy', 22.556565656565656]\n",
+            "['relatively', 'close', 46.0]\n",
+            "['close', 'from', 46.0]\n",
+            "['sastra', 'university', 14.952385484830458]\n",
+            "['various', 'other', 46.0]\n",
+            "['are', 'also', 22.556565656565656]\n",
+            "['also', 'present', 22.556565656565656]\n",
+            "['present', 'in', 22.556565656565656]\n",
+            "['in', 'tamilnadu', 22.556565656565656]\n",
+            "['offers', 'a', 14.835831180017228]\n",
+            "['a', 'lot', 14.835831180017228]\n",
+            "['lot', 'of', 46.0]\n",
+            "['of', 'courses', 46.0]\n",
+            "['an', 'acronym', 46.0]\n",
+            "['acronym', 'nit', 46.0]\n",
+            "['a', 'college', 14.835831180017228]\n",
+            "['college', 'near', 46.0]\n",
+            "['near', 'trichy', 22.556565656565656]\n",
+            "['trichy', ',', 22.556565656565656]\n",
+            "[',', 'but', 46.0]\n",
+            "['but', 'not', 46.0]\n",
+            "['not', 'a', 14.835831180017228]\n"
+          ]
+        }
+      ]
+    }
+  ]
+}