noshot 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (210)
  1. noshot/__init__.py +1 -0
  2. noshot/data/AIDS CN NLP/AIDS/1. Implement Basic Search Strategies/(A) Breadth First Search.ipynb +112 -0
  3. noshot/data/AIDS CN NLP/AIDS/1. Implement Basic Search Strategies/(B) Depth First Search.ipynb +111 -0
  4. noshot/data/AIDS CN NLP/AIDS/1. Implement Basic Search Strategies/(C) Uniform Cost Search.ipynb +134 -0
  5. noshot/data/AIDS CN NLP/AIDS/1. Implement Basic Search Strategies/(D) Depth Limites Search.ipynb +115 -0
  6. noshot/data/AIDS CN NLP/AIDS/1. Implement Basic Search Strategies/(E) Iterative Deepening DFS.ipynb +123 -0
  7. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/2_ANOVA.csv +769 -0
  8. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/One Way ANOVA (Repeated Measure).ipynb +126 -0
  9. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/One Way ANOVA.ipynb +134 -0
  10. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/Sample 1 Way ANOVA Test.ipynb +119 -0
  11. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/Two Way ANOVA.ipynb +138 -0
  12. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/reaction_time.csv +5 -0
  13. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/sample_data.csv +16 -0
  14. noshot/data/AIDS CN NLP/AIDS/10. ANOVA/sleep_deprivation.csv +4 -0
  15. noshot/data/AIDS CN NLP/AIDS/11. Linear Regression/3_Linear.csv +4802 -0
  16. noshot/data/AIDS CN NLP/AIDS/11. Linear Regression/Linear Regression LAB.ipynb +113 -0
  17. noshot/data/AIDS CN NLP/AIDS/11. Linear Regression/Linear Regression New- sklearn.ipynb +118 -0
  18. noshot/data/AIDS CN NLP/AIDS/11. Linear Regression/Linear Regression.ipynb +148 -0
  19. noshot/data/AIDS CN NLP/AIDS/11. Linear Regression/house_rate.csv +22 -0
  20. noshot/data/AIDS CN NLP/AIDS/12. Logistic Regression/Logistic Regression New- sklearn.ipynb +128 -0
  21. noshot/data/AIDS CN NLP/AIDS/12. Logistic Regression/Logistic Regression.ipynb +145 -0
  22. noshot/data/AIDS CN NLP/AIDS/12. Logistic Regression/default.csv +1001 -0
  23. noshot/data/AIDS CN NLP/AIDS/12. Logistic Regression/hours_scores_records.csv +101 -0
  24. noshot/data/AIDS CN NLP/AIDS/2. Implement A Star And MA Star/(A) Astar.ipynb +256 -0
  25. noshot/data/AIDS CN NLP/AIDS/2. Implement A Star And MA Star/(B) IDAstar.ipynb +157 -0
  26. noshot/data/AIDS CN NLP/AIDS/2. Implement A Star And MA Star/(C) SMAstar.ipynb +178 -0
  27. noshot/data/AIDS CN NLP/AIDS/3. Genetic Algorithm/Genetic.ipynb +95 -0
  28. noshot/data/AIDS CN NLP/AIDS/4. Simulated Annealing/Simulated Annealing.ipynb +74 -0
  29. noshot/data/AIDS CN NLP/AIDS/4. Simulated Annealing/Sudoku Simulated Annealing.ipynb +103 -0
  30. noshot/data/AIDS CN NLP/AIDS/5. Alpha Beta Pruning/AlphaBetaPruning.ipynb +182 -0
  31. noshot/data/AIDS CN NLP/AIDS/6. Consraint Satisfaction Problems (CSP)/(A) CSP House Allocation.ipynb +120 -0
  32. noshot/data/AIDS CN NLP/AIDS/6. Consraint Satisfaction Problems (CSP)/(B) CSP Map Coloring.ipynb +125 -0
  33. noshot/data/AIDS CN NLP/AIDS/7. Random Sampling/Random Sampling.ipynb +73 -0
  34. noshot/data/AIDS CN NLP/AIDS/7. Random Sampling/height_weight_bmi.csv +8389 -0
  35. noshot/data/AIDS CN NLP/AIDS/8. Z Test/Z Test Hash Function.ipynb +141 -0
  36. noshot/data/AIDS CN NLP/AIDS/8. Z Test/Z Test.ipynb +151 -0
  37. noshot/data/AIDS CN NLP/AIDS/8. Z Test/height_weight_bmi.csv +8389 -0
  38. noshot/data/AIDS CN NLP/AIDS/9. T Test/1_heart.csv +304 -0
  39. noshot/data/AIDS CN NLP/AIDS/9. T Test/Independent T Test.ipynb +119 -0
  40. noshot/data/AIDS CN NLP/AIDS/9. T Test/Paired T Test.ipynb +118 -0
  41. noshot/data/AIDS CN NLP/AIDS/9. T Test/T Test Hash Function.ipynb +142 -0
  42. noshot/data/AIDS CN NLP/AIDS/9. T Test/T Test.ipynb +158 -0
  43. noshot/data/AIDS CN NLP/AIDS/9. T Test/height_weight_bmi.csv +8389 -0
  44. noshot/data/AIDS CN NLP/AIDS/9. T Test/iq_test.csv +0 -0
  45. noshot/data/AIDS CN NLP/AIDS/Others (AllinOne)/All In One.ipynb +4581 -0
  46. noshot/data/AIDS CN NLP/CN/1. Chat Application/chat.java +81 -0
  47. noshot/data/AIDS CN NLP/CN/1. Chat Application/output.png +0 -0
  48. noshot/data/AIDS CN NLP/CN/1. Chat Application/procedure.png +0 -0
  49. noshot/data/AIDS CN NLP/CN/10. Ethernet LAN IEEE 802.3/LAN.tcl +65 -0
  50. noshot/data/AIDS CN NLP/CN/10. Ethernet LAN IEEE 802.3/analysis.awk +44 -0
  51. noshot/data/AIDS CN NLP/CN/10. Ethernet LAN IEEE 802.3/output.png +0 -0
  52. noshot/data/AIDS CN NLP/CN/10. Ethernet LAN IEEE 802.3/procedure.png +0 -0
  53. noshot/data/AIDS CN NLP/CN/11. Wireless LAN IEEE 802.11/complexdcf.tcl +229 -0
  54. noshot/data/AIDS CN NLP/CN/11. Wireless LAN IEEE 802.11/output.png +0 -0
  55. noshot/data/AIDS CN NLP/CN/11. Wireless LAN IEEE 802.11/procedure.png +0 -0
  56. noshot/data/AIDS CN NLP/CN/2. File Transfer/file_to_send.txt +2 -0
  57. noshot/data/AIDS CN NLP/CN/2. File Transfer/filetransfer.java +119 -0
  58. noshot/data/AIDS CN NLP/CN/2. File Transfer/output.png +0 -0
  59. noshot/data/AIDS CN NLP/CN/2. File Transfer/procedure.png +0 -0
  60. noshot/data/AIDS CN NLP/CN/3. RMI (Remote Method Invocation)/output.png +0 -0
  61. noshot/data/AIDS CN NLP/CN/3. RMI (Remote Method Invocation)/procedure.png +0 -0
  62. noshot/data/AIDS CN NLP/CN/3. RMI (Remote Method Invocation)/rmi.java +56 -0
  63. noshot/data/AIDS CN NLP/CN/4. Wired Network/output.png +0 -0
  64. noshot/data/AIDS CN NLP/CN/4. Wired Network/procedure.png +0 -0
  65. noshot/data/AIDS CN NLP/CN/4. Wired Network/wired.awk +25 -0
  66. noshot/data/AIDS CN NLP/CN/4. Wired Network/wired.tcl +81 -0
  67. noshot/data/AIDS CN NLP/CN/5. Wireless Network/output.png +0 -0
  68. noshot/data/AIDS CN NLP/CN/5. Wireless Network/procedure.png +0 -0
  69. noshot/data/AIDS CN NLP/CN/5. Wireless Network/wireless.awk +27 -0
  70. noshot/data/AIDS CN NLP/CN/5. Wireless Network/wireless.tcl +153 -0
  71. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/analysis.awk +27 -0
  72. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/output.png +0 -0
  73. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/sack.tcl +86 -0
  74. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/vegas.tcl +86 -0
  75. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/analysis.awk +28 -0
  76. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/output.png +0 -0
  77. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/reno.tcl +78 -0
  78. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/tahoe.tcl +79 -0
  79. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Flow Control/analysis.awk +27 -0
  80. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Flow Control/flow.tcl +163 -0
  81. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/TCP Flow Control/output.png +0 -0
  82. noshot/data/AIDS CN NLP/CN/6. TCP Flow And Congestion Control/procedure.png +0 -0
  83. noshot/data/AIDS CN NLP/CN/7. Link State And Distance Vector Routing/DV.tcl +111 -0
  84. noshot/data/AIDS CN NLP/CN/7. Link State And Distance Vector Routing/LS.tcl +106 -0
  85. noshot/data/AIDS CN NLP/CN/7. Link State And Distance Vector Routing/analysis.awk +36 -0
  86. noshot/data/AIDS CN NLP/CN/7. Link State And Distance Vector Routing/output.png +0 -0
  87. noshot/data/AIDS CN NLP/CN/7. Link State And Distance Vector Routing/procedure.png +0 -0
  88. noshot/data/AIDS CN NLP/CN/8. Multicast And Broadcast Routing/analysis.awk +20 -0
  89. noshot/data/AIDS CN NLP/CN/8. Multicast And Broadcast Routing/broadcast.tcl +76 -0
  90. noshot/data/AIDS CN NLP/CN/8. Multicast And Broadcast Routing/multicast.tcl +103 -0
  91. noshot/data/AIDS CN NLP/CN/8. Multicast And Broadcast Routing/output.png +0 -0
  92. noshot/data/AIDS CN NLP/CN/8. Multicast And Broadcast Routing/procedure.png +0 -0
  93. noshot/data/AIDS CN NLP/CN/9. DHCP/DHCP.java +125 -0
  94. noshot/data/AIDS CN NLP/CN/9. DHCP/output.png +0 -0
  95. noshot/data/AIDS CN NLP/CN/9. DHCP/procedure.png +0 -0
  96. noshot/data/AIDS CN NLP/NLP/NLP 1/1-Prereqs.py +18 -0
  97. noshot/data/AIDS CN NLP/NLP/NLP 1/2-Chi2test.py +83 -0
  98. noshot/data/AIDS CN NLP/NLP/NLP 1/2-T-test.py +79 -0
  99. noshot/data/AIDS CN NLP/NLP/NLP 1/3-WSD-nb.py +53 -0
  100. noshot/data/AIDS CN NLP/NLP/NLP 1/4-Hindle-Rooth.py +53 -0
  101. noshot/data/AIDS CN NLP/NLP/NLP 1/5-HMM-Trellis.py +82 -0
  102. noshot/data/AIDS CN NLP/NLP/NLP 1/6-HMM-Viterbi.py +16 -0
  103. noshot/data/AIDS CN NLP/NLP/NLP 1/7-PCFG-parsetree.py +15 -0
  104. noshot/data/AIDS CN NLP/NLP/NLP 1/Chi2test.ipynb +285 -0
  105. noshot/data/AIDS CN NLP/NLP/NLP 1/Hindle-Rooth.ipynb +179 -0
  106. noshot/data/AIDS CN NLP/NLP/NLP 1/Lab 10 - Text generator using LSTM.ipynb +1461 -0
  107. noshot/data/AIDS CN NLP/NLP/NLP 1/Lab 11 NMT.ipynb +2307 -0
  108. noshot/data/AIDS CN NLP/NLP/NLP 1/PCFG.ipynb +134 -0
  109. noshot/data/AIDS CN NLP/NLP/NLP 1/Prereqs.ipynb +131 -0
  110. noshot/data/AIDS CN NLP/NLP/NLP 1/T test.ipynb +252 -0
  111. noshot/data/AIDS CN NLP/NLP/NLP 1/TFIDF BOW.ipynb +171 -0
  112. noshot/data/AIDS CN NLP/NLP/NLP 1/Trellis.ipynb +244 -0
  113. noshot/data/AIDS CN NLP/NLP/NLP 1/WSD.ipynb +645 -0
  114. noshot/data/AIDS CN NLP/NLP/NLP 1/Word2Vec.ipynb +93 -0
  115. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab01(tokenizer)/tokenizer.ipynb +370 -0
  116. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab01(tokenizer)/training_tokenizer.txt +6 -0
  117. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab02(stemming)/exp0.ipynb +274 -0
  118. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab02(stemming)/lab2.ipynb +905 -0
  119. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab02(stemming)/test.txt +1 -0
  120. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab02(stemming)/tokenizing.ipynb +272 -0
  121. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab03(parse-tree)/collocation.ipynb +332 -0
  122. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab03(parse-tree)/lab3.ipynb +549 -0
  123. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab03(parse-tree)/nlp.txt +1 -0
  124. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab04(collocation)/Lab4-NLP-Exp-2.ipynb +817 -0
  125. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab04(collocation)/collocation.ipynb +332 -0
  126. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab05(WSD)/NLP-Lab-5-Exp3.ipynb +231 -0
  127. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab05(WSD)/word-sense-disambiguation.ipynb +507 -0
  128. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab06(additional-exercise)/lab6.ipynb +134 -0
  129. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP Exp 4.ipynb +255 -0
  130. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP_Exp_5.ipynb +159 -0
  131. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab08(PCFG)/PCFG.ipynb +282 -0
  132. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab09-Hindle-rooth&MLP/Lab 9 - MLP classifier.ipynb +670 -0
  133. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab09-Hindle-rooth&MLP/MLP-alternative-code.ipynb +613 -0
  134. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab09-Hindle-rooth&MLP/hindle-rooth-algorithm.ipynb +74 -0
  135. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab10(LSTM)/Lab_10_Text_generator_using_LSTM.ipynb +480 -0
  136. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Machine-translation.ipynb +445 -0
  137. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Viterbi-PCFG.ipynb +105 -0
  138. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/corpora_tools.py +87 -0
  139. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/data_utils.py +11 -0
  140. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/train_translator.py +83 -0
  141. noshot/data/AIDS CN NLP/NLP/NLP 2/Lab12(Information-Extraction)/Information_Extraction.ipynb +201 -0
  142. noshot/data/AIDS CN NLP/NLP/NLP 3/Backtrack-without-Verbitri.ipynb +185 -0
  143. noshot/data/AIDS CN NLP/NLP/NLP 3/Backward-Procedure.ipynb +597 -0
  144. noshot/data/AIDS CN NLP/NLP/NLP 3/Bag_of.ipynb +1422 -0
  145. noshot/data/AIDS CN NLP/NLP/NLP 3/CYK-algorithm.ipynb +1067 -0
  146. noshot/data/AIDS CN NLP/NLP/NLP 3/Forward-Procedure.ipynb +477 -0
  147. noshot/data/AIDS CN NLP/NLP/NLP 3/LSTM.ipynb +1290 -0
  148. noshot/data/AIDS CN NLP/NLP/NLP 3/Lab 10 - Text generator using LSTM.ipynb +1461 -0
  149. noshot/data/AIDS CN NLP/NLP/NLP 3/Lab 11 NMT.ipynb +2307 -0
  150. noshot/data/AIDS CN NLP/NLP/NLP 3/NLP-LAB-4.ipynb +216 -0
  151. noshot/data/AIDS CN NLP/NLP/NLP 3/NLP-LAB-5.ipynb +216 -0
  152. noshot/data/AIDS CN NLP/NLP/NLP 3/abc.txt +6 -0
  153. noshot/data/AIDS CN NLP/NLP/NLP 3/ex-1-nltk.ipynb +711 -0
  154. noshot/data/AIDS CN NLP/NLP/NLP 3/ex-2-nlp.ipynb +267 -0
  155. noshot/data/AIDS CN NLP/NLP/NLP 3/exp8&9.ipynb +305 -0
  156. noshot/data/AIDS CN NLP/NLP/NLP 3/hind.ipynb +287 -0
  157. noshot/data/AIDS CN NLP/NLP/NLP 3/lab66.ipynb +752 -0
  158. noshot/data/AIDS CN NLP/NLP/NLP 3/leb_3.ipynb +612 -0
  159. noshot/data/AIDS CN NLP/NLP/NLP 3/naive_bayes_classifier.pkl +0 -0
  160. noshot/data/AIDS CN NLP/NLP/NLP 3/nlp_leb_1.ipynb +3008 -0
  161. noshot/data/AIDS CN NLP/NLP/NLP 3/nlp_leb_2.ipynb +3095 -0
  162. noshot/data/AIDS CN NLP/NLP/NLP 3/nlplab-9.ipynb +295 -0
  163. noshot/data/AIDS CN NLP/NLP/NLP 3/nltk-ex-4.ipynb +506 -0
  164. noshot/data/AIDS CN NLP/NLP/NLP 3/text1.txt +48 -0
  165. noshot/data/AIDS CN NLP/NLP/NLP 3/text2.txt +8 -0
  166. noshot/data/AIDS CN NLP/NLP/NLP 3/text3.txt +48 -0
  167. noshot/data/AIDS CN NLP/NLP/NLP 3/translation-rnn.ipynb +812 -0
  168. noshot/data/AIDS CN NLP/NLP/NLP 3/word2vector.ipynb +173 -0
  169. noshot/data/AIDS CN NLP/NLP/NLP 4/Backward Procedure Algorithm.ipynb +179 -0
  170. noshot/data/AIDS CN NLP/NLP/NLP 4/Chi Square Collocation.ipynb +208 -0
  171. noshot/data/AIDS CN NLP/NLP/NLP 4/Collocation (T test).ipynb +188 -0
  172. noshot/data/AIDS CN NLP/NLP/NLP 4/Experiment 1.ipynb +437 -0
  173. noshot/data/AIDS CN NLP/NLP/NLP 4/Forward Procedure Algorithm.ipynb +132 -0
  174. noshot/data/AIDS CN NLP/NLP/NLP 4/Hindle Rooth.ipynb +414 -0
  175. noshot/data/AIDS CN NLP/NLP/NLP 4/MachineTranslation.ipynb +368 -0
  176. noshot/data/AIDS CN NLP/NLP/NLP 4/Multi Layer Perceptron using MLPClassifier.ipynb +86 -0
  177. noshot/data/AIDS CN NLP/NLP/NLP 4/Multi Layer Perceptron using Tensorflow.ipynb +112 -0
  178. noshot/data/AIDS CN NLP/NLP/NLP 4/PCFG Inside Probability.ipynb +451 -0
  179. noshot/data/AIDS CN NLP/NLP/NLP 4/Text Generation using LSTM.ipynb +297 -0
  180. noshot/data/AIDS CN NLP/NLP/NLP 4/Viterbi.ipynb +310 -0
  181. noshot/data/AIDS CN NLP/NLP/NLP 4/Word Sense Disambiguation.ipynb +335 -0
  182. noshot/data/AIDS CN NLP/NLP/NLP 5/10.Text Generation using LSTM.ipynb +316 -0
  183. noshot/data/AIDS CN NLP/NLP/NLP 5/11.Machine Translation.ipynb +868 -0
  184. noshot/data/AIDS CN NLP/NLP/NLP 5/2.T and Chi2 Test.ipynb +204 -0
  185. noshot/data/AIDS CN NLP/NLP/NLP 5/3.Word Sense Diambiguation.ipynb +234 -0
  186. noshot/data/AIDS CN NLP/NLP/NLP 5/4.Hinddle and Rooth.ipynb +128 -0
  187. noshot/data/AIDS CN NLP/NLP/NLP 5/5.Forward and Backward.ipynb +149 -0
  188. noshot/data/AIDS CN NLP/NLP/NLP 5/6.Viterbi.ipynb +111 -0
  189. noshot/data/AIDS CN NLP/NLP/NLP 5/7.PCFG Parse Tree.ipynb +134 -0
  190. noshot/data/AIDS CN NLP/NLP/NLP 5/7.PCFG using cyk.ipynb +101 -0
  191. noshot/data/AIDS CN NLP/NLP/NLP 5/8.Bag of words and TF-IDF.ipynb +310 -0
  192. noshot/data/AIDS CN NLP/NLP/NLP 5/9.Word2Vector.ipynb +78 -0
  193. noshot/data/AIDS CN NLP/NLP/NLP 5/NLP ALL In One.ipynb +2619 -0
  194. noshot/data/AIDS CN NLP/NLP/NLP 5/sample1.txt +15 -0
  195. noshot/data/AIDS CN NLP/NLP/NLP 5/sample2.txt +4 -0
  196. noshot/data/AIDS CN NLP/NLP/NLP 5/word2vec_model.bin +0 -0
  197. noshot/data/AIDS CN NLP/NLP/NLP 6/1. Tokenize, Tagging, NER, Parse Tree.ipynb +312 -0
  198. noshot/data/AIDS CN NLP/NLP/NLP 6/2. T Test and Chi2 Test.ipynb +185 -0
  199. noshot/data/AIDS CN NLP/NLP/NLP 6/3. Naive Bayes WSD.ipynb +199 -0
  200. noshot/data/AIDS CN NLP/NLP/NLP 6/4. Hinddle and Rooth.ipynb +151 -0
  201. noshot/data/AIDS CN NLP/NLP/NLP 6/5 and 6 FWD, BWD, Viterbi.ipynb +164 -0
  202. noshot/data/AIDS CN NLP/NLP/NLP 6/7. PCFG using CYK.ipynb +383 -0
  203. noshot/data/AIDS CN NLP/NLP/NLP 6/8. BOW and TF-IDF.ipynb +252 -0
  204. noshot/data/AIDS CN NLP/Ubuntu CN Lab.iso +0 -0
  205. noshot/main.py +47 -0
  206. noshot-0.1.0.dist-info/LICENSE.txt +21 -0
  207. noshot-0.1.0.dist-info/METADATA +65 -0
  208. noshot-0.1.0.dist-info/RECORD +210 -0
  209. noshot-0.1.0.dist-info/WHEEL +5 -0
  210. noshot-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,445 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9cc9eb92-1649-44d7-8fbb-765a0b611d62",
6
+ "metadata": {},
7
+ "source": [
8
+ "**Reference**: <https://hub.packtpub.com/create-an-rnn-based-python-machine-translation-system-tutorial/>"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 8,
14
+ "id": "120b7270-bf6e-4204-b259-39b469ba5e90",
15
+ "metadata": {
16
+ "tags": []
17
+ },
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "<AlignedSent: 'Wiederaufnahme der S...' -> 'Resumption of the se...'>\n"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "from nltk.corpus import comtrans\n",
29
+ "print(comtrans.aligned_sents('alignment-de-en.txt')[0])"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 9,
35
+ "id": "d7a2798f-d324-422f-b2d2-a7b938261c16",
36
+ "metadata": {
37
+ "tags": []
38
+ },
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "['Wiederaufnahme', 'der', 'Sitzungsperiode']\n",
45
+ "['Resumption', 'of', 'the', 'session']\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "print(comtrans.aligned_sents()[0].words)\n",
51
+ "print(comtrans.aligned_sents()[0].mots)"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 10,
57
+ "id": "7c4f76f1-8358-4a0f-bf58-307489aeba08",
58
+ "metadata": {
59
+ "tags": []
60
+ },
61
+ "outputs": [
62
+ {
63
+ "name": "stdout",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "0-0 1-1 1-2 2-3\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "print(comtrans.aligned_sents()[0].alignment)"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 11,
77
+ "id": "b8ff206f-63f9-48cd-9e11-8ce2ea21e846",
78
+ "metadata": {
79
+ "tags": []
80
+ },
81
+ "outputs": [],
82
+ "source": [
83
+ "import pickle\n",
84
+ "import re\n",
85
+ "from collections import Counter\n",
86
+ "from nltk.corpus import comtrans"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 12,
92
+ "id": "df801d23-b633-4cfb-a0fc-98095dee75b1",
93
+ "metadata": {
94
+ "tags": []
95
+ },
96
+ "outputs": [],
97
+ "source": [
98
+ "def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):\n",
99
+ " print(\"Retrieving corpora: {}\".format(translated_sentences_l1_l2))\n",
100
+ " als = comtrans.aligned_sents(translated_sentences_l1_l2)\n",
101
+ " sentences_l1 = [sent.words for sent in als]\n",
102
+ " sentences_l2 = [sent.mots for sent in als]\n",
103
+ " return sentences_l1, sentences_l2"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 13,
109
+ "id": "c2c17038-bedb-4c9a-9729-727091aab2e0",
110
+ "metadata": {
111
+ "tags": []
112
+ },
113
+ "outputs": [
114
+ {
115
+ "name": "stdout",
116
+ "output_type": "stream",
117
+ "text": [
118
+ "Retrieving corpora: alignment-de-en.txt\n"
119
+ ]
120
+ },
121
+ {
122
+ "name": "stdout",
123
+ "output_type": "stream",
124
+ "text": [
125
+ "# A sentence in the two languages DE & EN\n",
126
+ "DE: ['Wiederaufnahme', 'der', 'Sitzungsperiode']\n",
127
+ "EN: ['Resumption', 'of', 'the', 'session']\n",
128
+ "# Corpora length (i.e. number of sentences)\n",
129
+ "33334\n"
130
+ ]
131
+ }
132
+ ],
133
+ "source": [
134
+ "sen_l1, sen_l2 = retrieve_corpora()\n",
135
+ "print(\"# A sentence in the two languages DE & EN\")\n",
136
+ "print(\"DE:\", sen_l1[0])\n",
137
+ "print(\"EN:\", sen_l2[0])\n",
138
+ "print(\"# Corpora length (i.e. number of sentences)\")\n",
139
+ "print(len(sen_l1))\n",
140
+ "assert len(sen_l1) == len(sen_l2)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 14,
146
+ "id": "be6ef737-7982-4c10-8236-d0cf0ec355ed",
147
+ "metadata": {
148
+ "tags": []
149
+ },
150
+ "outputs": [],
151
+ "source": [
152
+ "import re\n",
153
+ "\n",
154
+ "def clean_sentence(sentence):\n",
155
+ " regex_splitter = re.compile(r\"([!?.,:;$'\\\")( ])\")\n",
156
+ " clean_words = [re.split(regex_splitter, word.lower()) for word in sentence]\n",
157
+ " return [w for words in clean_words for w in words if words and w]\n"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 15,
163
+ "id": "e99df888-bb6a-4af0-b129-fa128e1f4ee9",
164
+ "metadata": {
165
+ "tags": []
166
+ },
167
+ "outputs": [
168
+ {
169
+ "name": "stdout",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "# Same sentence as before, but chunked and cleaned\n",
173
+ "DE: ['wiederaufnahme', 'der', 'sitzungsperiode']\n",
174
+ "EN: ['resumption', 'of', 'the', 'session']\n"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "clean_sen_l1 = [clean_sentence(s) for s in sen_l1]\n",
180
+ "clean_sen_l2 = [clean_sentence(s) for s in sen_l2]\n",
181
+ "print(\"# Same sentence as before, but chunked and cleaned\")\n",
182
+ "print(\"DE:\", clean_sen_l1[0])\n",
183
+ "print(\"EN:\", clean_sen_l2[0])"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 16,
189
+ "id": "4031c2f0-6432-4c53-9b9d-658128481bb8",
190
+ "metadata": {
191
+ "tags": []
192
+ },
193
+ "outputs": [],
194
+ "source": [
195
+ "def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):\n",
196
+ " filtered_sentences_l1 = []\n",
197
+ " filtered_sentences_l2 = []\n",
198
+ " for i in range(len(sentences_l1)):\n",
199
+ " if min_len <= len(sentences_l1[i]) <= max_len and min_len <= len(sentences_l2[i]) <= max_len:\n",
200
+ " filtered_sentences_l1.append(sentences_l1[i])\n",
201
+ " filtered_sentences_l2.append(sentences_l2[i])\n",
202
+ " return filtered_sentences_l1, filtered_sentences_l2\n"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 17,
208
+ "id": "5d0f122a-c015-494a-8b44-15b42cc289d4",
209
+ "metadata": {
210
+ "tags": []
211
+ },
212
+ "outputs": [
213
+ {
214
+ "name": "stdout",
215
+ "output_type": "stream",
216
+ "text": [
217
+ "# Filtered Corpora length (i.e. number of sentences)\n",
218
+ "14788\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, \n",
224
+ " clean_sen_l2)\n",
225
+ "print(\"# Filtered Corpora length (i.e. number of sentences)\")\n",
226
+ "print(len(filt_clean_sen_l1))\n",
227
+ "assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": 18,
233
+ "id": "75aa1efa-0a7b-4992-a256-2489241b0989",
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "import data_utils\n",
238
+ "\n",
239
+ "def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):\n",
240
+ " count_words = Counter()\n",
241
+ " dict_words = {}\n",
242
+ " opt_dict_size = len(data_utils.OP_DICT_IDS)\n",
243
+ " \n",
244
+ " for sen in sentences:\n",
245
+ " for word in sen:\n",
246
+ " count_words[word] += 1\n",
247
+ "\n",
248
+ " dict_words[data_utils._PAD] = data_utils.PAD_ID\n",
249
+ " dict_words[data_utils._GO] = data_utils.GO_ID\n",
250
+ " dict_words[data_utils._EOS] = data_utils.EOS_ID\n",
251
+ " dict_words[data_utils._UNK] = data_utils.UNK_ID\n",
252
+ "\n",
253
+ " for idx, item in enumerate(count_words.most_common(dict_size)):\n",
254
+ " dict_words[item[0]] = idx + opt_dict_size\n",
255
+ "\n",
256
+ " if storage_path:\n",
257
+ " pickle.dump(dict_words, open(storage_path, \"wb\"))\n",
258
+ " \n",
259
+ " return dict_words\n"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 19,
265
+ "id": "29ca9d76",
266
+ "metadata": {},
267
+ "outputs": [],
268
+ "source": [
269
+ "def sentences_to_indexes(sentences, indexed_dictionary):\n",
270
+ " indexed_sentences = []\n",
271
+ " not_found_counter = 0\n",
272
+ " \n",
273
+ " for sent in sentences:\n",
274
+ " idx_sent = []\n",
275
+ " for word in sent:\n",
276
+ " try:\n",
277
+ " idx_sent.append(indexed_dictionary[word])\n",
278
+ " except KeyError:\n",
279
+ " idx_sent.append(data_utils.UNK_ID)\n",
280
+ " not_found_counter += 1\n",
281
+ " indexed_sentences.append(idx_sent)\n",
282
+ " \n",
283
+ " print('[sentences_to_indexes] Did not find {} words'.format(not_found_counter))\n",
284
+ " return indexed_sentences\n"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 21,
290
+ "id": "3e4a7aa8",
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "name": "stdout",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "[sentences_to_indexes] Did not find 0 words\n",
298
+ "[sentences_to_indexes] Did not find 0 words\n",
299
+ "# Same sentences as before, with their dictionary ID\n",
300
+ "DE: [('sentence', 4), ('one', 8), ('for', 5), ('language', 6), ('1', 7)]\n"
301
+ ]
302
+ }
303
+ ],
304
+ "source": [
305
+ "# NOTE: the two assignments below replace the filtered corpus with a tiny hand-written toy dataset\n",
306
+ "filt_clean_sen_l1 = [\n",
307
+ " [\"sentence\", \"one\", \"for\", \"language\", \"1\"],\n",
308
+ " [\"another\", \"sentence\", \"for\", \"language\", \"1\"],\n",
309
+ " # Add more sentences as needed\n",
310
+ "]\n",
311
+ "\n",
312
+ "filt_clean_sen_l2 = [\n",
313
+ " [\"sentence\", \"one\", \"for\", \"language\", \"2\"],\n",
314
+ " [\"another\", \"sentence\", \"for\", \"language\", \"2\"],\n",
315
+ " # Add more sentences as needed\n",
316
+ "]\n",
317
+ "\n",
318
+ "# Build both dictionaries and convert the sentences to lists of dictionary IDs\n",
319
+ "dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=15000, storage_path=\"/tmp/l1_dict.p\")\n",
320
+ "dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=10000, storage_path=\"/tmp/l2_dict.p\")\n",
321
+ "idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)\n",
322
+ "idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)\n",
323
+ "\n",
324
+ "print(\"# Same sentences as before, with their dictionary ID\")\n",
325
+ "print(\"DE:\", list(zip(filt_clean_sen_l1[0], idx_sentences_l1[0])))\n"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 22,
331
+ "id": "64fa3be1",
332
+ "metadata": {},
333
+ "outputs": [],
334
+ "source": [
335
+ "# Same sentences as before, with their dictionary ID\n",
336
+ "DE: [('wiederaufnahme', 1616), ('der', 7), ('sitzungsperiode', 618)]\n",
337
+ "EN: [('resumption', 1779), ('of', 8), ('the', 5), ('session', 549)]"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 23,
343
+ "id": "32cf9dad",
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "def extract_max_length(corpora):\n",
348
+ " return max([len(sentence) for sentence in corpora])"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 24,
354
+ "id": "256f5ec9",
355
+ "metadata": {},
356
+ "outputs": [
357
+ {
358
+ "name": "stdout",
359
+ "output_type": "stream",
360
+ "text": [
361
+ "# Max sentence sizes:\n",
362
+ "DE: 5\n",
363
+ "EN: 5\n"
364
+ ]
365
+ }
366
+ ],
367
+ "source": [
368
+ "max_length_l1 = extract_max_length(idx_sentences_l1)\n",
369
+ "max_length_l2 = extract_max_length(idx_sentences_l2)\n",
370
+ "print(\"# Max sentence sizes:\")\n",
371
+ "print(\"DE:\", max_length_l1)\n",
372
+ "print(\"EN:\", max_length_l2)"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 25,
378
+ "id": "e5f5429e",
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):\n",
383
+ " assert len(sentences_l1) == len(sentences_l2)\n",
384
+ " data_set = []\n",
385
+ " for i in range(len(sentences_l1)):\n",
386
+ " padding_l1 = len_l1 - len(sentences_l1[i])\n",
387
+ " pad_sentence_l1 = ([data_utils.PAD_ID]*padding_l1) + sentences_l1[i]\n",
388
+ " padding_l2 = len_l2 - len(sentences_l2[i])\n",
389
+ " pad_sentence_l2 = [data_utils.GO_ID] + sentences_l2[i] + [data_utils.EOS_ID] + ([data_utils.PAD_ID] * padding_l2)\n",
390
+ " data_set.append([pad_sentence_l1, pad_sentence_l2])\n",
391
+ " return data_set"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": 26,
397
+ "id": "6ff5117d",
398
+ "metadata": {},
399
+ "outputs": [
400
+ {
401
+ "name": "stdout",
402
+ "output_type": "stream",
403
+ "text": [
404
+ "# Prepared minibatch with paddings and extra stuff\n",
405
+ "DE: [4, 8, 5, 6, 7]\n",
406
+ "EN: [1, 4, 8, 5, 6, 7, 2]\n",
407
+ "# The sentence pass from X to Y tokens\n",
408
+ "DE: 5 -> 5\n",
409
+ "EN: 5 -> 7\n"
410
+ ]
411
+ }
412
+ ],
413
+ "source": [
414
+ "data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)\n",
415
+ "print(\"# Prepared minibatch with paddings and extra stuff\")\n",
416
+ "print(\"DE:\", data_set[0][0])\n",
417
+ "print(\"EN:\", data_set[0][1])\n",
418
+ "print(\"# The sentence pass from X to Y tokens\")\n",
419
+ "print(\"DE:\", len(idx_sentences_l1[0]), \"->\", len(data_set[0][0]))\n",
420
+ "print(\"EN:\", len(idx_sentences_l2[0]), \"->\", len(data_set[0][1]))"
421
+ ]
422
+ }
423
+ ],
424
+ "metadata": {
425
+ "kernelspec": {
426
+ "display_name": "Python 3 (ipykernel)",
427
+ "language": "python",
428
+ "name": "python3"
429
+ },
430
+ "language_info": {
431
+ "codemirror_mode": {
432
+ "name": "ipython",
433
+ "version": 3
434
+ },
435
+ "file_extension": ".py",
436
+ "mimetype": "text/x-python",
437
+ "name": "python",
438
+ "nbconvert_exporter": "python",
439
+ "pygments_lexer": "ipython3",
440
+ "version": "3.10.12"
441
+ }
442
+ },
443
+ "nbformat": 4,
444
+ "nbformat_minor": 5
445
+ }
@@ -0,0 +1,105 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "id": "9169418e-d21e-44c3-9765-d0406635ac5c",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [
11
+ {
12
+ "name": "stdout",
13
+ "output_type": "stream",
14
+ "text": [
15
+ "Most Probable Parse Tree: ('S', ('NP', ('Det', 'the'), ('N', 'cat')), ('NP', ('Det', 'chased'), ('N', 'bat')))\n",
16
+ "Parse Probability: 1.0\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "from collections import defaultdict\n",
22
+ "\n",
23
def viterbi_pcfg(words, pcfg_rules, start_symbol='S'):
    """Return the most probable parse of ``words`` under a PCFG (CKY/Viterbi).

    The grammar is a dict mapping each nonterminal to ``(probability, rhs)``.
    A rule is treated as *binary* (``A -> B C``) when ``rhs`` holds exactly
    two symbols that are both keys of ``pcfg_rules``; every other rule is
    treated as *lexical*, i.e. ``A -> w`` with the given probability for
    each terminal ``w`` listed in ``rhs``.  Unary nonterminal rules cannot be
    expressed in this format and are therefore not supported.

    Args:
        words: Sequence of tokens to parse.
        pcfg_rules: Grammar in the format described above.
        start_symbol: Nonterminal that has to span the whole sentence.

    Returns:
        A ``(parse_tree, parse_probability)`` tuple.  Trees are nested
        tuples: ``(nonterminal, word)`` for a leaf and
        ``(nonterminal, left_subtree, right_subtree)`` for an inner node.
        ``(None, 0.0)`` is returned when ``words`` is empty or cannot be
        derived from ``start_symbol``.
    """
    n = len(words)
    if n == 0:
        return None, 0.0

    # Split the grammar once so the chart loops only touch relevant rules.
    binary_rules = []
    lexical_rules = []
    for lhs, (rule_prob, rhs) in pcfg_rules.items():
        if len(rhs) == 2 and all(symbol in pcfg_rules for symbol in rhs):
            binary_rules.append((lhs, rule_prob, rhs[0], rhs[1]))
        else:
            lexical_rules.append((lhs, rule_prob, rhs))

    # table[i][j] maps a nonterminal to (best probability, backpointer) for
    # the span words[i..j].  Plain dicts are used on purpose: a missing key
    # means "not derivable", and lookups must never create entries.
    table = [[{} for _ in range(n)] for _ in range(n)]
    no_entry = (0.0, None)

    # Initialization: length-1 spans come from lexical rules only.
    for i, word in enumerate(words):
        cell = table[i][i]
        for lhs, rule_prob, terminals in lexical_rules:
            if word in terminals and rule_prob > cell.get(lhs, no_entry)[0]:
                cell[lhs] = (rule_prob, None)

    # Recurrence: P(A over i..j) = max_k P(A -> B C) * P(B, i..k) * P(C, k+1..j)
    for length in range(2, n + 1):
        for i in range(n - length + 1):
            j = i + length - 1
            cell = table[i][j]
            for k in range(i, j):
                left_cell = table[i][k]
                right_cell = table[k + 1][j]
                if not left_cell or not right_cell:
                    continue
                for lhs, rule_prob, left_sym, right_sym in binary_rules:
                    if left_sym not in left_cell or right_sym not in right_cell:
                        continue
                    prob = (rule_prob
                            * left_cell[left_sym][0]
                            * right_cell[right_sym][0])
                    if prob > cell.get(lhs, no_entry)[0]:
                        cell[lhs] = (prob, (left_sym, right_sym, k))

    def reconstruct_tree(i, j, nt):
        """Follow backpointers from (i, j, nt) down to the words."""
        backpointer = table[i][j][nt][1]
        if backpointer is None:
            return (nt, words[i])
        left_sym, right_sym, k = backpointer
        return (nt,
                reconstruct_tree(i, k, left_sym),
                reconstruct_tree(k + 1, j, right_sym))

    root_cell = table[0][n - 1]
    if start_symbol not in root_cell:
        # The sentence is not in the language of the grammar.
        return None, 0.0

    parse_tree = reconstruct_tree(0, n - 1, start_symbol)
    parse_probability = root_cell[start_symbol][0]
    return parse_tree, parse_probability
61
+ "\n",
62
# Toy grammar: every nonterminal maps to (probability, right-hand side).
pcfg_rules = dict(
    S=(1.0, ['NP', 'VP']),
    NP=(0.6, ['Det', 'N']),
    VP=(0.7, ['V', 'NP']),
    Det=(1.0, ['the', 'a']),
    N=(0.5, ['cat', 'dog', 'bat']),
    V=(0.8, ['chased', 'caught']),
)

# Sentence to parse, already tokenised.
words = ['the', 'cat', 'chased', 'a', 'bat']

# Run the Viterbi PCFG parser and report its best tree with its probability.
parse_tree, parse_probability = viterbi_pcfg(words, pcfg_rules)

print(f'Most Probable Parse Tree: {parse_tree}')
print(f'Parse Probability: {parse_probability}')
81
+ ]
82
+ }
83
+ ],
84
+ "metadata": {
85
+ "kernelspec": {
86
+ "display_name": "Python 3 (ipykernel)",
87
+ "language": "python",
88
+ "name": "python3"
89
+ },
90
+ "language_info": {
91
+ "codemirror_mode": {
92
+ "name": "ipython",
93
+ "version": 3
94
+ },
95
+ "file_extension": ".py",
96
+ "mimetype": "text/x-python",
97
+ "name": "python",
98
+ "nbconvert_exporter": "python",
99
+ "pygments_lexer": "ipython3",
100
+ "version": "3.10.12"
101
+ }
102
+ },
103
+ "nbformat": 4,
104
+ "nbformat_minor": 5
105
+ }
@@ -0,0 +1,87 @@
1
+ import pickle
2
+ import re
3
+ from collections import Counter
4
+ from nltk.corpus import comtrans
5
+
6
def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    """Load an aligned corpus from NLTK's ``comtrans`` and split its two sides.

    Args:
        translated_sentences_l1_l2: File id passed to
            ``comtrans.aligned_sents``.

    Returns:
        Two parallel lists: the ``words`` side and the ``mots`` side of every
        aligned sentence, in corpus order.
    """
    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))

    side_l1 = []
    side_l2 = []
    # One pass over the aligned sentences fills both sides in lockstep.
    for aligned in comtrans.aligned_sents(translated_sentences_l1_l2):
        side_l1.append(aligned.words)
        side_l2.append(aligned.mots)
    return side_l1, side_l2
12
+
13
# Load both sides of the aligned corpus; later steps reuse these two names.
sen_l1, sen_l2 = retrieve_corpora()

# Show the first aligned pair as a quick sanity check.
print("# A sentence in the two languages DE & EN")
print("DE:", sen_l1[0])
print("EN:", sen_l2[0])

# Report the corpus size; both sides must hold the same number of sentences.
print("# Corpora length (i.e. number of sentences)")
print(len(sen_l1))
assert len(sen_l1) == len(sen_l2)
20
+
21
def clean_sentence(sentence):
    """Lower-case every token and split it around punctuation characters.

    Args:
        sentence: Iterable of string tokens.

    Returns:
        A flat list of the resulting pieces.  The delimiters themselves are
        kept as separate items; empty strings are dropped.
    """
    splitter = re.compile(r"([!?.,:;$'\")( ])")

    pieces = []
    for token in sentence:
        # The capturing group makes split() return the delimiters as well;
        # only the empty fragments around them are discarded.
        for fragment in re.split(splitter, token.lower()):
            if fragment:
                pieces.append(fragment)
    return pieces
25
+
26
# Normalise every sentence on both sides of the corpus.
clean_sen_l1 = [clean_sentence(sentence) for sentence in sen_l1]
clean_sen_l2 = [clean_sentence(sentence) for sentence in sen_l2]

# Show the same sample pair as before, now cleaned.
print("# Same sentence as before, but chunked and cleaned")
print("DE:", clean_sen_l1[0])
print("EN:", clean_sen_l2[0])
31
+
32
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    """Keep only the sentence pairs whose two sides are both within bounds.

    Args:
        sentences_l1: Sentences of the first language.
        sentences_l2: Sentences of the second language, index-aligned with
            ``sentences_l1``.
        min_len: Smallest accepted sentence length (inclusive).
        max_len: Largest accepted sentence length (inclusive).

    Returns:
        Two parallel lists holding the surviving sentences of each side.
    """
    def length_ok(sentence):
        return min_len <= len(sentence) <= max_len

    kept_l1, kept_l2 = [], []
    for position, first in enumerate(sentences_l1):
        if not length_ok(first):
            continue
        # The partner is looked up only after the first side has passed.
        second = sentences_l2[position]
        if not length_ok(second):
            continue
        kept_l1.append(first)
        kept_l2.append(second)
    return kept_l1, kept_l2
40
+
41
# Drop every pair in which either side falls outside the default length bounds.
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, clean_sen_l2)

# Report how many pairs are left; the two sides must still line up.
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(filt_clean_sen_l1))
assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)
46
+
47
+ import data_utils
48
+
49
def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    """Build a word -> integer id vocabulary from tokenised sentences.

    The reserved markers from ``data_utils`` are registered first with their
    fixed ids.  The ``dict_size`` most frequent words then receive consecutive
    ids starting right after the reserved ones, most frequent first.

    Args:
        sentences: Iterable of sentences, each an iterable of word tokens.
        dict_size: Maximum number of corpus words to include.
        storage_path: If truthy, the dictionary is pickled to this path.

    Returns:
        The word -> id dictionary.
    """
    count_words = Counter()
    for sen in sentences:
        count_words.update(sen)

    dict_words = {
        data_utils._PAD: data_utils.PAD_ID,
        data_utils._GO: data_utils.GO_ID,
        data_utils._EOS: data_utils.EOS_ID,
        data_utils._UNK: data_utils.UNK_ID,
    }

    # NOTE: a corpus word equal to one of the reserved marker strings
    # overwrites that marker's id here, exactly as in the original code.
    opt_dict_size = len(data_utils.OP_DICT_IDS)
    ranked = count_words.most_common(dict_size)
    for idx, (word, _count) in enumerate(ranked, start=opt_dict_size):
        dict_words[word] = idx

    if storage_path:
        # Context manager guarantees the file is flushed and closed even if
        # pickling fails; the previous bare open() leaked the handle.
        with open(storage_path, "wb") as dict_file:
            pickle.dump(dict_words, dict_file)

    return dict_words
70
+
71
def sentences_to_indexes(sentences, indexed_dictionary):
    """Translate tokenised sentences into lists of vocabulary ids.

    Args:
        sentences: Iterable of sentences, each an iterable of word tokens.
        indexed_dictionary: Mapping from word to integer id.

    Returns:
        A list with one list of ids per input sentence.  Words missing from
        ``indexed_dictionary`` are encoded as ``data_utils.UNK_ID``; the
        number of such misses is printed before returning.
    """
    misses = 0

    def lookup(token):
        """Return the id for ``token``, falling back to the unknown id."""
        nonlocal misses
        try:
            return indexed_dictionary[token]
        except KeyError:
            misses += 1
            return data_utils.UNK_ID

    encoded = [[lookup(token) for token in sentence] for sentence in sentences]

    print('[sentences_to_indexes] Did not find {} words'.format(misses))
    return encoded
87
+
@@ -0,0 +1,11 @@
1
# Reserved vocabulary markers and their fixed integer ids.
_PAD = "_PAD"  # filler used to pad sentences to a fixed length
_GO = "_GO"  # marker placed in front of a target sentence
_EOS = "_EOS"  # marker placed after a target sentence
_UNK = "_UNK"  # stand-in for words missing from the dictionary
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# The length of this list is the offset at which regular word ids begin.
OP_DICT_IDS = [PAD_ID, GO_ID, EOS_ID, UNK_ID]
11
+