PyPI - noshot - Versions diffs - 0.1.0__py3-none-any.whl - Mend

noshot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (210) hide show

noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Machine-translation.ipynb ADDED Viewed

@@ -0,0 +1,445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9cc9eb92-1649-44d7-8fbb-765a0b611d62",
+   "metadata": {},
+   "source": [
+    "**Reference**: <https://hub.packtpub.com/create-an-rnn-based-python-machine-translation-system-tutorial/>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "120b7270-bf6e-4204-b259-39b469ba5e90",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<AlignedSent: 'Wiederaufnahme der S...' -> 'Resumption of the se...'>\n"
+     ]
+    }
+   ],
+   "source": [
+    "from nltk.corpus import comtrans\n",
+    "print(comtrans.aligned_sents('alignment-de-en.txt')[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "d7a2798f-d324-422f-b2d2-a7b938261c16",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Wiederaufnahme', 'der', 'Sitzungsperiode']\n",
+      "['Resumption', 'of', 'the', 'session']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(comtrans.aligned_sents()[0].words)\n",
+    "print(comtrans.aligned_sents()[0].mots)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7c4f76f1-8358-4a0f-bf58-307489aeba08",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0-0 1-1 1-2 2-3\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(comtrans.aligned_sents()[0].alignment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "b8ff206f-63f9-48cd-9e11-8ce2ea21e846",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "import re\n",
+    "from collections import Counter\n",
+    "from nltk.corpus import comtrans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "df801d23-b633-4cfb-a0fc-98095dee75b1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):\n",
+    "    print(\"Retrieving corpora: {}\".format(translated_sentences_l1_l2))\n",
+    "    als = comtrans.aligned_sents(translated_sentences_l1_l2)\n",
+    "    sentences_l1 = [sent.words for sent in als]\n",
+    "    sentences_l2 = [sent.mots for sent in als]\n",
+    "    return sentences_l1, sentences_l2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "c2c17038-bedb-4c9a-9729-727091aab2e0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Retrieving corpora: alignment-de-en.txt\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# A sentence in the two languages DE & EN\n",
+      "DE: ['Wiederaufnahme', 'der', 'Sitzungsperiode']\n",
+      "EN: ['Resumption', 'of', 'the', 'session']\n",
+      "# Corpora length (i.e. number of sentences)\n",
+      "33334\n"
+     ]
+    }
+   ],
+   "source": [
+    "sen_l1, sen_l2 = retrieve_corpora()\n",
+    "print(\"# A sentence in the two languages DE & EN\")\n",
+    "print(\"DE:\", sen_l1[0])\n",
+    "print(\"EN:\", sen_l2[0])\n",
+    "print(\"# Corpora length (i.e. number of sentences)\")\n",
+    "print(len(sen_l1))\n",
+    "assert len(sen_l1) == len(sen_l2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "be6ef737-7982-4c10-8236-d0cf0ec355ed",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def clean_sentence(sentence):\n",
+    "    regex_splitter = re.compile(r\"([!?.,:;$'\\\")( ])\")\n",
+    "    clean_words = [re.split(regex_splitter, word.lower()) for word in sentence]\n",
+    "    return [w for words in clean_words for w in words if words and w]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e99df888-bb6a-4af0-b129-fa128e1f4ee9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# Same sentence as before, but chunked and cleaned\n",
+      "DE: ['wiederaufnahme', 'der', 'sitzungsperiode']\n",
+      "EN: ['resumption', 'of', 'the', 'session']\n"
+     ]
+    }
+   ],
+   "source": [
+    "clean_sen_l1 = [clean_sentence(s) for s in sen_l1]\n",
+    "clean_sen_l2 = [clean_sentence(s) for s in sen_l2]\n",
+    "print(\"# Same sentence as before, but chunked and cleaned\")\n",
+    "print(\"DE:\", clean_sen_l1[0])\n",
+    "print(\"EN:\", clean_sen_l2[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "4031c2f0-6432-4c53-9b9d-658128481bb8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):\n",
+    "    filtered_sentences_l1 = []\n",
+    "    filtered_sentences_l2 = []\n",
+    "    for i in range(len(sentences_l1)):\n",
+    "        if min_len <= len(sentences_l1[i]) <= max_len and min_len <= len(sentences_l2[i]) <= max_len:\n",
+    "            filtered_sentences_l1.append(sentences_l1[i])\n",
+    "            filtered_sentences_l2.append(sentences_l2[i])\n",
+    "    return filtered_sentences_l1, filtered_sentences_l2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "5d0f122a-c015-494a-8b44-15b42cc289d4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# Filtered Corpora length (i.e. number of sentences)\n",
+      "14788\n"
+     ]
+    }
+   ],
+   "source": [
+    "filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, \n",
+    "          clean_sen_l2)\n",
+    "print(\"# Filtered Corpora length (i.e. number of sentences)\")\n",
+    "print(len(filt_clean_sen_l1))\n",
+    "assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "75aa1efa-0a7b-4992-a256-2489241b0989",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import data_utils\n",
+    "\n",
+    "def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):\n",
+    "    count_words = Counter()\n",
+    "    dict_words = {}\n",
+    "    opt_dict_size = len(data_utils.OP_DICT_IDS)\n",
+    "    \n",
+    "    for sen in sentences:\n",
+    "        for word in sen:\n",
+    "            count_words[word] += 1\n",
+    "\n",
+    "    dict_words[data_utils._PAD] = data_utils.PAD_ID\n",
+    "    dict_words[data_utils._GO] = data_utils.GO_ID\n",
+    "    dict_words[data_utils._EOS] = data_utils.EOS_ID\n",
+    "    dict_words[data_utils._UNK] = data_utils.UNK_ID\n",
+    "\n",
+    "    for idx, item in enumerate(count_words.most_common(dict_size)):\n",
+    "        dict_words[item[0]] = idx + opt_dict_size\n",
+    "\n",
+    "    if storage_path:\n",
+    "        pickle.dump(dict_words, open(storage_path, \"wb\"))\n",
+    "        \n",
+    "    return dict_words\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "29ca9d76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sentences_to_indexes(sentences, indexed_dictionary):\n",
+    "    indexed_sentences = []\n",
+    "    not_found_counter = 0\n",
+    "    \n",
+    "    for sent in sentences:\n",
+    "        idx_sent = []\n",
+    "        for word in sent:\n",
+    "            try:\n",
+    "                idx_sent.append(indexed_dictionary[word])\n",
+    "            except KeyError:\n",
+    "                idx_sent.append(data_utils.UNK_ID)\n",
+    "                not_found_counter += 1\n",
+    "        indexed_sentences.append(idx_sent)\n",
+    "    \n",
+    "    print('[sentences_to_indexes] Did not find {} words'.format(not_found_counter))\n",
+    "    return indexed_sentences\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "3e4a7aa8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[sentences_to_indexes] Did not find 0 words\n",
+      "[sentences_to_indexes] Did not find 0 words\n",
+      "# Same sentences as before, with their dictionary ID\n",
+      "DE: [('sentence', 4), ('one', 8), ('for', 5), ('language', 6), ('1', 7)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Example of defining filt_clean_sen_l1 and filt_clean_sen_l2 with actual data\n",
+    "filt_clean_sen_l1 = [\n",
+    "    [\"sentence\", \"one\", \"for\", \"language\", \"1\"],\n",
+    "    [\"another\", \"sentence\", \"for\", \"language\", \"1\"],\n",
+    "    # Add more sentences as needed\n",
+    "]\n",
+    "\n",
+    "filt_clean_sen_l2 = [\n",
+    "    [\"sentence\", \"one\", \"for\", \"language\", \"2\"],\n",
+    "    [\"another\", \"sentence\", \"for\", \"language\", \"2\"],\n",
+    "    # Add more sentences as needed\n",
+    "]\n",
+    "\n",
+    "# Rest of your code remains the same\n",
+    "dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=15000, storage_path=\"/tmp/l1_dict.p\")\n",
+    "dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=10000, storage_path=\"/tmp/l2_dict.p\")\n",
+    "idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)\n",
+    "idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)\n",
+    "\n",
+    "print(\"# Same sentences as before, with their dictionary ID\")\n",
+    "print(\"DE:\", list(zip(filt_clean_sen_l1[0], idx_sentences_l1[0])))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "64fa3be1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Same sentences as before, with their dictionary ID\n",
+    "DE: [('wiederaufnahme', 1616), ('der', 7), ('sitzungsperiode', 618)]\n",
+    "EN: [('resumption', 1779), ('of', 8), ('the', 5), ('session', 549)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "32cf9dad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_max_length(corpora):\n",
+    "    return max([len(sentence) for sentence in corpora])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "256f5ec9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# Max sentence sizes:\n",
+      "DE: 5\n",
+      "EN: 5\n"
+     ]
+    }
+   ],
+   "source": [
+    "max_length_l1 = extract_max_length(idx_sentences_l1)\n",
+    "max_length_l2 = extract_max_length(idx_sentences_l2)\n",
+    "print(\"# Max sentence sizes:\")\n",
+    "print(\"DE:\", max_length_l1)\n",
+    "print(\"EN:\", max_length_l2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "e5f5429e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):\n",
+    "    assert len(sentences_l1) == len(sentences_l2)\n",
+    "    data_set = []\n",
+    "    for i in range(len(sentences_l1)):\n",
+    "        padding_l1 = len_l1 - len(sentences_l1[i])\n",
+    "        pad_sentence_l1 = ([data_utils.PAD_ID]*padding_l1) + sentences_l1[i]\n",
+    "        padding_l2 = len_l2 - len(sentences_l2[i])\n",
+    "        pad_sentence_l2 = [data_utils.GO_ID] + sentences_l2[i] + [data_utils.EOS_ID] + ([data_utils.PAD_ID] * padding_l2)\n",
+    "        data_set.append([pad_sentence_l1, pad_sentence_l2])\n",
+    "    return data_set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "6ff5117d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# Prepared minibatch with paddings and extra stuff\n",
+      "DE: [4, 8, 5, 6, 7]\n",
+      "EN: [1, 4, 8, 5, 6, 7, 2]\n",
+      "# The sentence pass from X to Y tokens\n",
+      "DE: 5 -> 5\n",
+      "EN: 5 -> 7\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)\n",
+    "print(\"# Prepared minibatch with paddings and extra stuff\")\n",
+    "print(\"DE:\", data_set[0][0])\n",
+    "print(\"EN:\", data_set[0][1])\n",
+    "print(\"# The sentence pass from X to Y tokens\")\n",
+    "print(\"DE:\", len(idx_sentences_l1[0]), \"->\", len(data_set[0][0]))\n",
+    "print(\"EN:\", len(idx_sentences_l2[0]), \"->\", len(data_set[0][1]))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Viterbi-PCFG.ipynb ADDED Viewed

@@ -0,0 +1,105 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9169418e-d21e-44c3-9765-d0406635ac5c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most Probable Parse Tree: ('S', ('NP', ('Det', 'the'), ('N', 'cat')), ('NP', ('Det', 'chased'), ('N', 'bat')))\n",
+      "Parse Probability: 1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import defaultdict\n",
+    "\n",
+    "def viterbi_pcfg(words, pcfg_rules):\n",
+    "    n = len(words)\n",
+    "    table = [[defaultdict(lambda: (0.0, None)) for _ in range(n)] for _ in range(n)]\n",
+    "\n",
+    "    # Initialization\n",
+    "    for i, word in enumerate(words):\n",
+    "        for nt, (prob, terminals) in pcfg_rules.items():\n",
+    "            if word in terminals:\n",
+    "                table[i][i][nt] = (prob, None)\n",
+    "\n",
+    "    # Viterbi Algorithm\n",
+    "    for length in range(2, n + 1):\n",
+    "        for i in range(n - length + 1):\n",
+    "            j = i + length - 1\n",
+    "            for k in range(i, j):\n",
+    "                for A, (prob_A, _) in pcfg_rules.items():\n",
+    "                    for B, (prob_B, _) in pcfg_rules.items():\n",
+    "                        for C in table[i][k]:\n",
+    "                            for D in table[k + 1][j]:\n",
+    "                                prob = prob_A * prob_B * pcfg_rules[A][1].count(C) * pcfg_rules[B][1].count(D)\n",
+    "                                if prob > table[i][j][A][0]:\n",
+    "                                    table[i][j][A] = (prob, (C, D, k))\n",
+    "\n",
+    "    # Reconstruct the most probable parse tree\n",
+    "    def reconstruct_tree(i, j, nt):\n",
+    "        if table[i][j][nt][1] is None:\n",
+    "            return (nt, words[i])\n",
+    "        else:\n",
+    "            C, D, k = table[i][j][nt][1]\n",
+    "            left_subtree = reconstruct_tree(i, k, C)\n",
+    "            right_subtree = reconstruct_tree(k + 1, j, D)\n",
+    "            return (nt, left_subtree, right_subtree)\n",
+    "\n",
+    "    # Get the most probable parse tree and its probability\n",
+    "    parse_tree = reconstruct_tree(0, n - 1, 'S')\n",
+    "    parse_probability = table[0][-1]['S'][0]\n",
+    "\n",
+    "    return parse_tree, parse_probability\n",
+    "\n",
+    "# Different PCFG rules\n",
+    "pcfg_rules = {\n",
+    "    'S': (1.0, ['NP', 'VP']),\n",
+    "    'NP': (0.6, ['Det', 'N']),\n",
+    "    'VP': (0.7, ['V', 'NP']),\n",
+    "    'Det': (1.0, ['the', 'a']),\n",
+    "    'N': (0.5, ['cat', 'dog', 'bat']),\n",
+    "    'V': (0.8, ['chased', 'caught'])\n",
+    "}\n",
+    "\n",
+    "# Different input sentence\n",
+    "words = ['the', 'cat', 'chased', 'a', 'bat']\n",
+    "\n",
+    "# Call Viterbi PCFG algorithm to get the most probable parse tree and its probability\n",
+    "parse_tree, parse_probability = viterbi_pcfg(words, pcfg_rules)\n",
+    "\n",
+    "# Print the most probable parse tree and its probability\n",
+    "print(f'Most Probable Parse Tree: {parse_tree}')\n",
+    "print(f'Parse Probability: {parse_probability}')\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/corpora_tools.py ADDED Viewed

@@ -0,0 +1,87 @@
+import pickle
+import re
+from collections import Counter
+from nltk.corpus import comtrans
+def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
+    """Read an aligned corpus via ``comtrans`` and split it into two sides.
+
+    Args:
+        translated_sentences_l1_l2: File id handed to
+            ``comtrans.aligned_sents``; defaults to the DE-EN alignment file.
+
+    Returns:
+        A ``(sentences_l1, sentences_l2)`` tuple of parallel lists: the
+        ``.words`` of every aligned sentence, and the ``.mots`` of every
+        aligned sentence, in corpus order.
+    """
+    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))
+    aligned = comtrans.aligned_sents(translated_sentences_l1_l2)
+    first_side = []
+    for pair in aligned:
+        first_side.append(pair.words)
+    second_side = []
+    for pair in aligned:
+        second_side.append(pair.mots)
+    return first_side, second_side
+# Module-level script step: load the default corpus and print the first
+# sentence pair plus the number of sentences.
+sen_l1, sen_l2 = retrieve_corpora()
+print("# A sentence in the two languages DE & EN")
+print("DE:", sen_l1[0])
+print("EN:", sen_l2[0])
+print("# Corpora length (i.e. number of sentences)")
+print(len(sen_l1))
+# Both lists are built from the same aligned sentences, so they must have
+# the same length.
+assert len(sen_l1) == len(sen_l2)
+def clean_sentence(sentence):
+    """Lower-case and re-tokenize one sentence.
+
+    Each word is lower-cased and split around the characters
+    ``! ? . , : ; $ ' " ) (`` and space. Because the pattern is a capturing
+    group, the separators themselves are kept as tokens; empty strings
+    produced by the split are dropped.
+
+    Args:
+        sentence: Iterable of word strings.
+
+    Returns:
+        A flat list of the resulting non-empty tokens, in order.
+    """
+    splitter = re.compile(r"([!?.,:;$'\")( ])")
+    tokens = []
+    for word in sentence:
+        for fragment in splitter.split(word.lower()):
+            if fragment:
+                tokens.append(fragment)
+    return tokens
+# Module-level script step: clean every sentence on both sides. Each input
+# sentence yields exactly one output sentence, so index i still refers to
+# the same pair in both lists.
+clean_sen_l1 = [clean_sentence(s) for s in sen_l1]
+clean_sen_l2 = [clean_sentence(s) for s in sen_l2]
+print("# Same sentence as before, but chunked and cleaned")
+print("DE:", clean_sen_l1[0])
+print("EN:", clean_sen_l2[0])
+def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
+    """Keep only the sentence pairs whose two sides both fit the length bounds.
+
+    Args:
+        sentences_l1: Sequence of tokenized sentences (first side).
+        sentences_l2: Sequence of tokenized sentences (second side), indexed
+            in parallel with ``sentences_l1``.
+        min_len: Smallest accepted sentence length, inclusive.
+        max_len: Largest accepted sentence length, inclusive.
+
+    Returns:
+        A ``(filtered_l1, filtered_l2)`` tuple of parallel lists holding the
+        surviving pairs in their original order.
+    """
+    def fits(sentence):
+        return min_len <= len(sentence) <= max_len
+
+    kept_l1 = []
+    kept_l2 = []
+    for position, first in enumerate(sentences_l1):
+        if not fits(first):
+            continue
+        # The second side is only looked up once the first side qualifies.
+        second = sentences_l2[position]
+        if not fits(second):
+            continue
+        kept_l1.append(first)
+        kept_l2.append(second)
+    return kept_l1, kept_l2
+# Module-level script step: drop pairs outside the default length bounds of
+# filter_sentence_length (min_len=0, max_len=20) and report how many remain.
+filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1,
+          clean_sen_l2)
+print("# Filtered Corpora length (i.e. number of sentences)")
+print(len(filt_clean_sen_l1))
+# Pairs are appended to both lists together, so the lengths must agree.
+assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)
+import data_utils
+def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
+    """Build a word -> integer id vocabulary from tokenized sentences.
+
+    The four reserved symbols from ``data_utils`` are mapped to their fixed
+    ids first. The ``dict_size`` most frequent words then receive consecutive
+    ids starting at ``len(data_utils.OP_DICT_IDS)``, most frequent first, so
+    the returned dictionary can hold up to ``dict_size`` words in addition to
+    the reserved symbols.
+
+    Args:
+        sentences: Iterable of sentences, each an iterable of word tokens.
+        dict_size: Maximum number of distinct words to keep.
+        storage_path: Optional file path; when truthy, the dictionary is
+            pickled to this path.
+
+    Returns:
+        The ``dict`` mapping each symbol or word to its integer id.
+    """
+    count_words = Counter()
+    for sen in sentences:
+        count_words.update(sen)
+
+    dict_words = {
+        data_utils._PAD: data_utils.PAD_ID,
+        data_utils._GO: data_utils.GO_ID,
+        data_utils._EOS: data_utils.EOS_ID,
+        data_utils._UNK: data_utils.UNK_ID,
+    }
+
+    # Real words are numbered after the reserved ids.
+    opt_dict_size = len(data_utils.OP_DICT_IDS)
+    for idx, (word, _) in enumerate(count_words.most_common(dict_size)):
+        dict_words[word] = idx + opt_dict_size
+
+    if storage_path:
+        # Context manager so the file is flushed and closed deterministically
+        # instead of leaking the handle until garbage collection.
+        with open(storage_path, "wb") as storage_file:
+            pickle.dump(dict_words, storage_file)
+    return dict_words
+def sentences_to_indexes(sentences, indexed_dictionary):
+    """Translate tokenized sentences into lists of dictionary ids.
+
+    Words missing from ``indexed_dictionary`` are replaced by
+    ``data_utils.UNK_ID``; the total number of such misses is printed once
+    all sentences have been processed.
+
+    Args:
+        sentences: Iterable of sentences, each an iterable of word tokens.
+        indexed_dictionary: Mapping from word to integer id.
+
+    Returns:
+        A list with one list of ids per input sentence, in the same order.
+    """
+    misses = 0
+    encoded = []
+    for tokens in sentences:
+        ids = []
+        for token in tokens:
+            try:
+                token_id = indexed_dictionary[token]
+            except KeyError:
+                token_id = data_utils.UNK_ID
+                misses += 1
+            ids.append(token_id)
+        encoded.append(ids)
+    print('[sentences_to_indexes] Did not find {} words'.format(misses))
+    return encoded

noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/data_utils.py ADDED Viewed

@@ -0,0 +1,11 @@
+# Reserved vocabulary symbols. corpora_tools.create_indexed_dictionary maps
+# each symbol to the fixed id defined below.
+_PAD = "_PAD"
+_GO = "_GO"
+_EOS = "_EOS"
+_UNK = "_UNK"
+_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
+# Fixed integer ids of the reserved symbols, in the same order as
+# _START_VOCAB.
+PAD_ID = 0
+GO_ID = 1
+EOS_ID = 2
+UNK_ID = 3
+# len(OP_DICT_IDS) is used as the offset at which real word ids start.
+OP_DICT_IDS = [PAD_ID, GO_ID, EOS_ID, UNK_ID]