noshot 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- noshot/data/ML TS XAI/ML/1. PCA - EDA/PCA-EDA.ipynb +207 -0
- noshot/data/ML TS XAI/ML/1. PCA - EDA/balance-scale.csv +626 -0
- noshot/data/ML TS XAI/ML/1. PCA - EDA/input.txt +625 -0
- noshot/data/ML TS XAI/ML/2. KNN Classifier/KNN.ipynb +287 -0
- noshot/data/ML TS XAI/ML/2. KNN Classifier/balance-scale.csv +626 -0
- noshot/data/ML TS XAI/ML/2. KNN Classifier/input.txt +625 -0
- noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/LDA.ipynb +83 -0
- noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/balance-scale.csv +626 -0
- noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/input.txt +625 -0
- noshot/data/ML TS XAI/ML/4. Linear Regression/Linear-Regression.ipynb +117 -0
- noshot/data/ML TS XAI/ML/4. Linear Regression/machine-data.csv +210 -0
- noshot/data/ML TS XAI/ML/5. Logistic Regression/Logistic-Regression.ipynb +137 -0
- noshot/data/ML TS XAI/ML/5. Logistic Regression/wine-dataset.csv +179 -0
- noshot/data/ML TS XAI/ML/6. Bayesian Classifier/Bayesian.ipynb +129 -0
- noshot/data/ML TS XAI/ML/6. Bayesian Classifier/wine-dataset.csv +179 -0
- {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/METADATA +2 -2
- noshot-0.1.8.dist-info/RECORD +24 -0
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(A) Breadth First Search.ipynb +0 -112
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(B) Depth First Search.ipynb +0 -111
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(C) Uniform Cost Search.ipynb +0 -134
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(D) Depth Limites Search.ipynb +0 -115
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(E) Iterative Deepening DFS.ipynb +0 -123
- noshot/data/ML TS XAI/AIDS/10. ANOVA/2_ANOVA.csv +0 -769
- noshot/data/ML TS XAI/AIDS/10. ANOVA/One Way ANOVA (Repeated Measure).ipynb +0 -126
- noshot/data/ML TS XAI/AIDS/10. ANOVA/One Way ANOVA.ipynb +0 -134
- noshot/data/ML TS XAI/AIDS/10. ANOVA/Sample 1 Way ANOVA Test.ipynb +0 -119
- noshot/data/ML TS XAI/AIDS/10. ANOVA/Two Way ANOVA.ipynb +0 -138
- noshot/data/ML TS XAI/AIDS/10. ANOVA/reaction_time.csv +0 -5
- noshot/data/ML TS XAI/AIDS/10. ANOVA/sample_data.csv +0 -16
- noshot/data/ML TS XAI/AIDS/10. ANOVA/sleep_deprivation.csv +0 -4
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/3_Linear.csv +0 -4802
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression LAB.ipynb +0 -113
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression New- sklearn.ipynb +0 -118
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression.ipynb +0 -148
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/house_rate.csv +0 -22
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/Logistic Regression New- sklearn.ipynb +0 -128
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/Logistic Regression.ipynb +0 -145
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/default.csv +0 -1001
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/hours_scores_records.csv +0 -101
- noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(A) Astar.ipynb +0 -256
- noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(B) IDAstar.ipynb +0 -157
- noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(C) SMAstar.ipynb +0 -178
- noshot/data/ML TS XAI/AIDS/3. Genetic Algorithm/Genetic.ipynb +0 -95
- noshot/data/ML TS XAI/AIDS/4. Simulated Annealing/Simulated Annealing.ipynb +0 -74
- noshot/data/ML TS XAI/AIDS/4. Simulated Annealing/Sudoku Simulated Annealing.ipynb +0 -103
- noshot/data/ML TS XAI/AIDS/5. Alpha Beta Pruning/AlphaBetaPruning.ipynb +0 -182
- noshot/data/ML TS XAI/AIDS/6. Consraint Satisfaction Problems (CSP)/(A) CSP House Allocation.ipynb +0 -120
- noshot/data/ML TS XAI/AIDS/6. Consraint Satisfaction Problems (CSP)/(B) CSP Map Coloring.ipynb +0 -125
- noshot/data/ML TS XAI/AIDS/7. Random Sampling/Random Sampling.ipynb +0 -73
- noshot/data/ML TS XAI/AIDS/7. Random Sampling/height_weight_bmi.csv +0 -8389
- noshot/data/ML TS XAI/AIDS/8. Z Test/Z Test Hash Function.ipynb +0 -141
- noshot/data/ML TS XAI/AIDS/8. Z Test/Z Test.ipynb +0 -151
- noshot/data/ML TS XAI/AIDS/8. Z Test/height_weight_bmi.csv +0 -8389
- noshot/data/ML TS XAI/AIDS/9. T Test/1_heart.csv +0 -304
- noshot/data/ML TS XAI/AIDS/9. T Test/Independent T Test.ipynb +0 -119
- noshot/data/ML TS XAI/AIDS/9. T Test/Paired T Test.ipynb +0 -118
- noshot/data/ML TS XAI/AIDS/9. T Test/T Test Hash Function.ipynb +0 -142
- noshot/data/ML TS XAI/AIDS/9. T Test/T Test.ipynb +0 -158
- noshot/data/ML TS XAI/AIDS/9. T Test/height_weight_bmi.csv +0 -8389
- noshot/data/ML TS XAI/AIDS/9. T Test/iq_test.csv +0 -0
- noshot/data/ML TS XAI/AIDS/Others (AllinOne)/All In One.ipynb +0 -4581
- noshot/data/ML TS XAI/CN/1. Chat Application/chat.java +0 -81
- noshot/data/ML TS XAI/CN/1. Chat Application/output.png +0 -0
- noshot/data/ML TS XAI/CN/1. Chat Application/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/LAN.tcl +0 -65
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/analysis.awk +0 -44
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/output.png +0 -0
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/complexdcf.tcl +0 -229
- noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/output.png +0 -0
- noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/2. File Transfer/file_to_send.txt +0 -2
- noshot/data/ML TS XAI/CN/2. File Transfer/filetransfer.java +0 -119
- noshot/data/ML TS XAI/CN/2. File Transfer/output.png +0 -0
- noshot/data/ML TS XAI/CN/2. File Transfer/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/Client.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/MyServerImpl.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/MyServerIntf.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/Server.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/output.png +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/rmi.java +0 -56
- noshot/data/ML TS XAI/CN/4. Wired Network/output.png +0 -0
- noshot/data/ML TS XAI/CN/4. Wired Network/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/4. Wired Network/wired.awk +0 -25
- noshot/data/ML TS XAI/CN/4. Wired Network/wired.tcl +0 -81
- noshot/data/ML TS XAI/CN/5. Wireless Network/output.png +0 -0
- noshot/data/ML TS XAI/CN/5. Wireless Network/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/5. Wireless Network/wireless.awk +0 -27
- noshot/data/ML TS XAI/CN/5. Wireless Network/wireless.tcl +0 -153
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/analysis.awk +0 -27
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/output.png +0 -0
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/sack.tcl +0 -86
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/vegas.tcl +0 -86
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/analysis.awk +0 -28
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/output.png +0 -0
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/reno.tcl +0 -78
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/tahoe.tcl +0 -79
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/analysis.awk +0 -27
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/flow.tcl +0 -163
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/output.png +0 -0
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/DV.tcl +0 -111
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/LS.tcl +0 -106
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/analysis.awk +0 -36
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/output.png +0 -0
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/analysis.awk +0 -20
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/broadcast.tcl +0 -76
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/multicast.tcl +0 -103
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/output.png +0 -0
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/9. DHCP/DHCP.java +0 -125
- noshot/data/ML TS XAI/CN/9. DHCP/output.png +0 -0
- noshot/data/ML TS XAI/CN/9. DHCP/procedure.png +0 -0
- noshot/data/ML TS XAI/NLP/NLP 1/1-Prereqs.py +0 -18
- noshot/data/ML TS XAI/NLP/NLP 1/2-Chi2test.py +0 -83
- noshot/data/ML TS XAI/NLP/NLP 1/2-T-test.py +0 -79
- noshot/data/ML TS XAI/NLP/NLP 1/3-WSD-nb.py +0 -53
- noshot/data/ML TS XAI/NLP/NLP 1/4-Hindle-Rooth.py +0 -53
- noshot/data/ML TS XAI/NLP/NLP 1/5-HMM-Trellis.py +0 -82
- noshot/data/ML TS XAI/NLP/NLP 1/6-HMM-Viterbi.py +0 -16
- noshot/data/ML TS XAI/NLP/NLP 1/7-PCFG-parsetree.py +0 -15
- noshot/data/ML TS XAI/NLP/NLP 1/Chi2test.ipynb +0 -285
- noshot/data/ML TS XAI/NLP/NLP 1/Hindle-Rooth.ipynb +0 -179
- noshot/data/ML TS XAI/NLP/NLP 1/Lab 10 - Text generator using LSTM.ipynb +0 -1461
- noshot/data/ML TS XAI/NLP/NLP 1/Lab 11 NMT.ipynb +0 -2307
- noshot/data/ML TS XAI/NLP/NLP 1/PCFG.ipynb +0 -134
- noshot/data/ML TS XAI/NLP/NLP 1/Prereqs.ipynb +0 -131
- noshot/data/ML TS XAI/NLP/NLP 1/T test.ipynb +0 -252
- noshot/data/ML TS XAI/NLP/NLP 1/TFIDF BOW.ipynb +0 -171
- noshot/data/ML TS XAI/NLP/NLP 1/Trellis.ipynb +0 -244
- noshot/data/ML TS XAI/NLP/NLP 1/WSD.ipynb +0 -645
- noshot/data/ML TS XAI/NLP/NLP 1/Word2Vec.ipynb +0 -93
- noshot/data/ML TS XAI/NLP/NLP 2/Lab01(tokenizer)/tokenizer.ipynb +0 -370
- noshot/data/ML TS XAI/NLP/NLP 2/Lab01(tokenizer)/training_tokenizer.txt +0 -6
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/exp0.ipynb +0 -274
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/lab2.ipynb +0 -905
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/test.txt +0 -1
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/tokenizing.ipynb +0 -272
- noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/collocation.ipynb +0 -332
- noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/lab3.ipynb +0 -549
- noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/nlp.txt +0 -1
- noshot/data/ML TS XAI/NLP/NLP 2/Lab04(collocation)/Lab4-NLP-Exp-2.ipynb +0 -817
- noshot/data/ML TS XAI/NLP/NLP 2/Lab04(collocation)/collocation.ipynb +0 -332
- noshot/data/ML TS XAI/NLP/NLP 2/Lab05(WSD)/NLP-Lab-5-Exp3.ipynb +0 -231
- noshot/data/ML TS XAI/NLP/NLP 2/Lab05(WSD)/word-sense-disambiguation.ipynb +0 -507
- noshot/data/ML TS XAI/NLP/NLP 2/Lab06(additional-exercise)/lab6.ipynb +0 -134
- noshot/data/ML TS XAI/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP Exp 4.ipynb +0 -255
- noshot/data/ML TS XAI/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP_Exp_5.ipynb +0 -159
- noshot/data/ML TS XAI/NLP/NLP 2/Lab08(PCFG)/PCFG.ipynb +0 -282
- noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/Lab 9 - MLP classifier.ipynb +0 -670
- noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/MLP-alternative-code.ipynb +0 -613
- noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/hindle-rooth-algorithm.ipynb +0 -74
- noshot/data/ML TS XAI/NLP/NLP 2/Lab10(LSTM)/Lab_10_Text_generator_using_LSTM.ipynb +0 -480
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Machine-translation.ipynb +0 -445
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Viterbi-PCFG.ipynb +0 -105
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/corpora_tools.py +0 -87
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/data_utils.py +0 -11
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/train_translator.py +0 -83
- noshot/data/ML TS XAI/NLP/NLP 2/Lab12(Information-Extraction)/Information_Extraction.ipynb +0 -201
- noshot/data/ML TS XAI/NLP/NLP 3/Backtrack-without-Verbitri.ipynb +0 -185
- noshot/data/ML TS XAI/NLP/NLP 3/Backward-Procedure.ipynb +0 -597
- noshot/data/ML TS XAI/NLP/NLP 3/Bag_of.ipynb +0 -1422
- noshot/data/ML TS XAI/NLP/NLP 3/CYK-algorithm.ipynb +0 -1067
- noshot/data/ML TS XAI/NLP/NLP 3/Forward-Procedure.ipynb +0 -477
- noshot/data/ML TS XAI/NLP/NLP 3/LSTM.ipynb +0 -1290
- noshot/data/ML TS XAI/NLP/NLP 3/Lab 10 - Text generator using LSTM.ipynb +0 -1461
- noshot/data/ML TS XAI/NLP/NLP 3/Lab 11 NMT.ipynb +0 -2307
- noshot/data/ML TS XAI/NLP/NLP 3/NLP-LAB-4.ipynb +0 -216
- noshot/data/ML TS XAI/NLP/NLP 3/NLP-LAB-5.ipynb +0 -216
- noshot/data/ML TS XAI/NLP/NLP 3/abc.txt +0 -6
- noshot/data/ML TS XAI/NLP/NLP 3/ex-1-nltk.ipynb +0 -711
- noshot/data/ML TS XAI/NLP/NLP 3/ex-2-nlp.ipynb +0 -267
- noshot/data/ML TS XAI/NLP/NLP 3/exp8&9.ipynb +0 -305
- noshot/data/ML TS XAI/NLP/NLP 3/hind.ipynb +0 -287
- noshot/data/ML TS XAI/NLP/NLP 3/lab66.ipynb +0 -752
- noshot/data/ML TS XAI/NLP/NLP 3/leb_3.ipynb +0 -612
- noshot/data/ML TS XAI/NLP/NLP 3/naive_bayes_classifier.pkl +0 -0
- noshot/data/ML TS XAI/NLP/NLP 3/nlp_leb_1.ipynb +0 -3008
- noshot/data/ML TS XAI/NLP/NLP 3/nlp_leb_2.ipynb +0 -3095
- noshot/data/ML TS XAI/NLP/NLP 3/nlplab-9.ipynb +0 -295
- noshot/data/ML TS XAI/NLP/NLP 3/nltk-ex-4.ipynb +0 -506
- noshot/data/ML TS XAI/NLP/NLP 3/text1.txt +0 -48
- noshot/data/ML TS XAI/NLP/NLP 3/text2.txt +0 -8
- noshot/data/ML TS XAI/NLP/NLP 3/text3.txt +0 -48
- noshot/data/ML TS XAI/NLP/NLP 3/translation-rnn.ipynb +0 -812
- noshot/data/ML TS XAI/NLP/NLP 3/word2vector.ipynb +0 -173
- noshot/data/ML TS XAI/NLP/NLP 4/Backward Procedure Algorithm.ipynb +0 -179
- noshot/data/ML TS XAI/NLP/NLP 4/Chi Square Collocation.ipynb +0 -208
- noshot/data/ML TS XAI/NLP/NLP 4/Collocation (T test).ipynb +0 -188
- noshot/data/ML TS XAI/NLP/NLP 4/Experiment 1.ipynb +0 -437
- noshot/data/ML TS XAI/NLP/NLP 4/Forward Procedure Algorithm.ipynb +0 -132
- noshot/data/ML TS XAI/NLP/NLP 4/Hindle Rooth.ipynb +0 -414
- noshot/data/ML TS XAI/NLP/NLP 4/MachineTranslation.ipynb +0 -368
- noshot/data/ML TS XAI/NLP/NLP 4/Multi Layer Perceptron using MLPClassifier.ipynb +0 -86
- noshot/data/ML TS XAI/NLP/NLP 4/Multi Layer Perceptron using Tensorflow.ipynb +0 -112
- noshot/data/ML TS XAI/NLP/NLP 4/PCFG Inside Probability.ipynb +0 -451
- noshot/data/ML TS XAI/NLP/NLP 4/Text Generation using LSTM.ipynb +0 -297
- noshot/data/ML TS XAI/NLP/NLP 4/Viterbi.ipynb +0 -310
- noshot/data/ML TS XAI/NLP/NLP 4/Word Sense Disambiguation.ipynb +0 -335
- noshot/data/ML TS XAI/NLP/NLP 5/10.Text Generation using LSTM.ipynb +0 -316
- noshot/data/ML TS XAI/NLP/NLP 5/11.Machine Translation.ipynb +0 -868
- noshot/data/ML TS XAI/NLP/NLP 5/2.T and Chi2 Test.ipynb +0 -204
- noshot/data/ML TS XAI/NLP/NLP 5/3.Word Sense Diambiguation.ipynb +0 -234
- noshot/data/ML TS XAI/NLP/NLP 5/4.Hinddle and Rooth.ipynb +0 -128
- noshot/data/ML TS XAI/NLP/NLP 5/5.Forward and Backward.ipynb +0 -149
- noshot/data/ML TS XAI/NLP/NLP 5/6.Viterbi.ipynb +0 -111
- noshot/data/ML TS XAI/NLP/NLP 5/7.PCFG Parse Tree.ipynb +0 -134
- noshot/data/ML TS XAI/NLP/NLP 5/7.PCFG using cyk.ipynb +0 -101
- noshot/data/ML TS XAI/NLP/NLP 5/8.Bag of words and TF-IDF.ipynb +0 -310
- noshot/data/ML TS XAI/NLP/NLP 5/9.Word2Vector.ipynb +0 -78
- noshot/data/ML TS XAI/NLP/NLP 5/NLP ALL In One.ipynb +0 -2619
- noshot/data/ML TS XAI/NLP/NLP 5/sample1.txt +0 -15
- noshot/data/ML TS XAI/NLP/NLP 5/sample2.txt +0 -4
- noshot/data/ML TS XAI/NLP/NLP 5/word2vec_model.bin +0 -0
- noshot/data/ML TS XAI/NLP/NLP 6/1. Tokenize, Tagging, NER, Parse Tree.ipynb +0 -312
- noshot/data/ML TS XAI/NLP/NLP 6/2. T Test and Chi2 Test.ipynb +0 -185
- noshot/data/ML TS XAI/NLP/NLP 6/3. Naive Bayes WSD.ipynb +0 -199
- noshot/data/ML TS XAI/NLP/NLP 6/4. Hinddle and Rooth.ipynb +0 -151
- noshot/data/ML TS XAI/NLP/NLP 6/5 and 6 FWD, BWD, Viterbi.ipynb +0 -164
- noshot/data/ML TS XAI/NLP/NLP 6/7. PCFG using CYK.ipynb +0 -383
- noshot/data/ML TS XAI/NLP/NLP 6/8. BOW and TF-IDF.ipynb +0 -252
- noshot/data/ML TS XAI/Ubuntu CN Lab.iso +0 -0
- noshot-0.1.7.dist-info/RECORD +0 -216
- {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/LICENSE.txt +0 -0
- {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/WHEEL +0 -0
- {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/top_level.txt +0 -0
noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Machine-translation.ipynb
DELETED
@@ -1,445 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "markdown",
|
5
|
-
"id": "9cc9eb92-1649-44d7-8fbb-765a0b611d62",
|
6
|
-
"metadata": {},
|
7
|
-
"source": [
|
8
|
-
"**Reference**: <https://hub.packtpub.com/create-an-rnn-based-python-machine-translation-system-tutorial/>"
|
9
|
-
]
|
10
|
-
},
|
11
|
-
{
|
12
|
-
"cell_type": "code",
|
13
|
-
"execution_count": 8,
|
14
|
-
"id": "120b7270-bf6e-4204-b259-39b469ba5e90",
|
15
|
-
"metadata": {
|
16
|
-
"tags": []
|
17
|
-
},
|
18
|
-
"outputs": [
|
19
|
-
{
|
20
|
-
"name": "stdout",
|
21
|
-
"output_type": "stream",
|
22
|
-
"text": [
|
23
|
-
"<AlignedSent: 'Wiederaufnahme der S...' -> 'Resumption of the se...'>\n"
|
24
|
-
]
|
25
|
-
}
|
26
|
-
],
|
27
|
-
"source": [
|
28
|
-
"from nltk.corpus import comtrans\n",
|
29
|
-
"print(comtrans.aligned_sents('alignment-de-en.txt')[0])"
|
30
|
-
]
|
31
|
-
},
|
32
|
-
{
|
33
|
-
"cell_type": "code",
|
34
|
-
"execution_count": 9,
|
35
|
-
"id": "d7a2798f-d324-422f-b2d2-a7b938261c16",
|
36
|
-
"metadata": {
|
37
|
-
"tags": []
|
38
|
-
},
|
39
|
-
"outputs": [
|
40
|
-
{
|
41
|
-
"name": "stdout",
|
42
|
-
"output_type": "stream",
|
43
|
-
"text": [
|
44
|
-
"['Wiederaufnahme', 'der', 'Sitzungsperiode']\n",
|
45
|
-
"['Resumption', 'of', 'the', 'session']\n"
|
46
|
-
]
|
47
|
-
}
|
48
|
-
],
|
49
|
-
"source": [
|
50
|
-
"print(comtrans.aligned_sents()[0].words)\n",
|
51
|
-
"print(comtrans.aligned_sents()[0].mots)"
|
52
|
-
]
|
53
|
-
},
|
54
|
-
{
|
55
|
-
"cell_type": "code",
|
56
|
-
"execution_count": 10,
|
57
|
-
"id": "7c4f76f1-8358-4a0f-bf58-307489aeba08",
|
58
|
-
"metadata": {
|
59
|
-
"tags": []
|
60
|
-
},
|
61
|
-
"outputs": [
|
62
|
-
{
|
63
|
-
"name": "stdout",
|
64
|
-
"output_type": "stream",
|
65
|
-
"text": [
|
66
|
-
"0-0 1-1 1-2 2-3\n"
|
67
|
-
]
|
68
|
-
}
|
69
|
-
],
|
70
|
-
"source": [
|
71
|
-
"print(comtrans.aligned_sents()[0].alignment)"
|
72
|
-
]
|
73
|
-
},
|
74
|
-
{
|
75
|
-
"cell_type": "code",
|
76
|
-
"execution_count": 11,
|
77
|
-
"id": "b8ff206f-63f9-48cd-9e11-8ce2ea21e846",
|
78
|
-
"metadata": {
|
79
|
-
"tags": []
|
80
|
-
},
|
81
|
-
"outputs": [],
|
82
|
-
"source": [
|
83
|
-
"import pickle\n",
|
84
|
-
"import re\n",
|
85
|
-
"from collections import Counter\n",
|
86
|
-
"from nltk.corpus import comtrans"
|
87
|
-
]
|
88
|
-
},
|
89
|
-
{
|
90
|
-
"cell_type": "code",
|
91
|
-
"execution_count": 12,
|
92
|
-
"id": "df801d23-b633-4cfb-a0fc-98095dee75b1",
|
93
|
-
"metadata": {
|
94
|
-
"tags": []
|
95
|
-
},
|
96
|
-
"outputs": [],
|
97
|
-
"source": [
|
98
|
-
"def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):\n",
|
99
|
-
" print(\"Retrieving corpora: {}\".format(translated_sentences_l1_l2))\n",
|
100
|
-
" als = comtrans.aligned_sents(translated_sentences_l1_l2)\n",
|
101
|
-
" sentences_l1 = [sent.words for sent in als]\n",
|
102
|
-
" sentences_l2 = [sent.mots for sent in als]\n",
|
103
|
-
" return sentences_l1, sentences_l2"
|
104
|
-
]
|
105
|
-
},
|
106
|
-
{
|
107
|
-
"cell_type": "code",
|
108
|
-
"execution_count": 13,
|
109
|
-
"id": "c2c17038-bedb-4c9a-9729-727091aab2e0",
|
110
|
-
"metadata": {
|
111
|
-
"tags": []
|
112
|
-
},
|
113
|
-
"outputs": [
|
114
|
-
{
|
115
|
-
"name": "stdout",
|
116
|
-
"output_type": "stream",
|
117
|
-
"text": [
|
118
|
-
"Retrieving corpora: alignment-de-en.txt\n"
|
119
|
-
]
|
120
|
-
},
|
121
|
-
{
|
122
|
-
"name": "stdout",
|
123
|
-
"output_type": "stream",
|
124
|
-
"text": [
|
125
|
-
"# A sentence in the two languages DE & EN\n",
|
126
|
-
"DE: ['Wiederaufnahme', 'der', 'Sitzungsperiode']\n",
|
127
|
-
"EN: ['Resumption', 'of', 'the', 'session']\n",
|
128
|
-
"# Corpora length (i.e. number of sentences)\n",
|
129
|
-
"33334\n"
|
130
|
-
]
|
131
|
-
}
|
132
|
-
],
|
133
|
-
"source": [
|
134
|
-
"sen_l1, sen_l2 = retrieve_corpora()\n",
|
135
|
-
"print(\"# A sentence in the two languages DE & EN\")\n",
|
136
|
-
"print(\"DE:\", sen_l1[0])\n",
|
137
|
-
"print(\"EN:\", sen_l2[0])\n",
|
138
|
-
"print(\"# Corpora length (i.e. number of sentences)\")\n",
|
139
|
-
"print(len(sen_l1))\n",
|
140
|
-
"assert len(sen_l1) == len(sen_l2)"
|
141
|
-
]
|
142
|
-
},
|
143
|
-
{
|
144
|
-
"cell_type": "code",
|
145
|
-
"execution_count": 14,
|
146
|
-
"id": "be6ef737-7982-4c10-8236-d0cf0ec355ed",
|
147
|
-
"metadata": {
|
148
|
-
"tags": []
|
149
|
-
},
|
150
|
-
"outputs": [],
|
151
|
-
"source": [
|
152
|
-
"import re\n",
|
153
|
-
"\n",
|
154
|
-
"def clean_sentence(sentence):\n",
|
155
|
-
" regex_splitter = re.compile(r\"([!?.,:;$'\\\")( ])\")\n",
|
156
|
-
" clean_words = [re.split(regex_splitter, word.lower()) for word in sentence]\n",
|
157
|
-
" return [w for words in clean_words for w in words if words and w]\n"
|
158
|
-
]
|
159
|
-
},
|
160
|
-
{
|
161
|
-
"cell_type": "code",
|
162
|
-
"execution_count": 15,
|
163
|
-
"id": "e99df888-bb6a-4af0-b129-fa128e1f4ee9",
|
164
|
-
"metadata": {
|
165
|
-
"tags": []
|
166
|
-
},
|
167
|
-
"outputs": [
|
168
|
-
{
|
169
|
-
"name": "stdout",
|
170
|
-
"output_type": "stream",
|
171
|
-
"text": [
|
172
|
-
"# Same sentence as before, but chunked and cleaned\n",
|
173
|
-
"DE: ['wiederaufnahme', 'der', 'sitzungsperiode']\n",
|
174
|
-
"EN: ['resumption', 'of', 'the', 'session']\n"
|
175
|
-
]
|
176
|
-
}
|
177
|
-
],
|
178
|
-
"source": [
|
179
|
-
"clean_sen_l1 = [clean_sentence(s) for s in sen_l1]\n",
|
180
|
-
"clean_sen_l2 = [clean_sentence(s) for s in sen_l2]\n",
|
181
|
-
"print(\"# Same sentence as before, but chunked and cleaned\")\n",
|
182
|
-
"print(\"DE:\", clean_sen_l1[0])\n",
|
183
|
-
"print(\"EN:\", clean_sen_l2[0])"
|
184
|
-
]
|
185
|
-
},
|
186
|
-
{
|
187
|
-
"cell_type": "code",
|
188
|
-
"execution_count": 16,
|
189
|
-
"id": "4031c2f0-6432-4c53-9b9d-658128481bb8",
|
190
|
-
"metadata": {
|
191
|
-
"tags": []
|
192
|
-
},
|
193
|
-
"outputs": [],
|
194
|
-
"source": [
|
195
|
-
"def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):\n",
|
196
|
-
" filtered_sentences_l1 = []\n",
|
197
|
-
" filtered_sentences_l2 = []\n",
|
198
|
-
" for i in range(len(sentences_l1)):\n",
|
199
|
-
" if min_len <= len(sentences_l1[i]) <= max_len and min_len <= len(sentences_l2[i]) <= max_len:\n",
|
200
|
-
" filtered_sentences_l1.append(sentences_l1[i])\n",
|
201
|
-
" filtered_sentences_l2.append(sentences_l2[i])\n",
|
202
|
-
" return filtered_sentences_l1, filtered_sentences_l2\n"
|
203
|
-
]
|
204
|
-
},
|
205
|
-
{
|
206
|
-
"cell_type": "code",
|
207
|
-
"execution_count": 17,
|
208
|
-
"id": "5d0f122a-c015-494a-8b44-15b42cc289d4",
|
209
|
-
"metadata": {
|
210
|
-
"tags": []
|
211
|
-
},
|
212
|
-
"outputs": [
|
213
|
-
{
|
214
|
-
"name": "stdout",
|
215
|
-
"output_type": "stream",
|
216
|
-
"text": [
|
217
|
-
"# Filtered Corpora length (i.e. number of sentences)\n",
|
218
|
-
"14788\n"
|
219
|
-
]
|
220
|
-
}
|
221
|
-
],
|
222
|
-
"source": [
|
223
|
-
"filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, \n",
|
224
|
-
" clean_sen_l2)\n",
|
225
|
-
"print(\"# Filtered Corpora length (i.e. number of sentences)\")\n",
|
226
|
-
"print(len(filt_clean_sen_l1))\n",
|
227
|
-
"assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)"
|
228
|
-
]
|
229
|
-
},
|
230
|
-
{
|
231
|
-
"cell_type": "code",
|
232
|
-
"execution_count": 18,
|
233
|
-
"id": "75aa1efa-0a7b-4992-a256-2489241b0989",
|
234
|
-
"metadata": {},
|
235
|
-
"outputs": [],
|
236
|
-
"source": [
|
237
|
-
"import data_utils\n",
|
238
|
-
"\n",
|
239
|
-
"def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):\n",
|
240
|
-
" count_words = Counter()\n",
|
241
|
-
" dict_words = {}\n",
|
242
|
-
" opt_dict_size = len(data_utils.OP_DICT_IDS)\n",
|
243
|
-
" \n",
|
244
|
-
" for sen in sentences:\n",
|
245
|
-
" for word in sen:\n",
|
246
|
-
" count_words[word] += 1\n",
|
247
|
-
"\n",
|
248
|
-
" dict_words[data_utils._PAD] = data_utils.PAD_ID\n",
|
249
|
-
" dict_words[data_utils._GO] = data_utils.GO_ID\n",
|
250
|
-
" dict_words[data_utils._EOS] = data_utils.EOS_ID\n",
|
251
|
-
" dict_words[data_utils._UNK] = data_utils.UNK_ID\n",
|
252
|
-
"\n",
|
253
|
-
" for idx, item in enumerate(count_words.most_common(dict_size)):\n",
|
254
|
-
" dict_words[item[0]] = idx + opt_dict_size\n",
|
255
|
-
"\n",
|
256
|
-
" if storage_path:\n",
|
257
|
-
" pickle.dump(dict_words, open(storage_path, \"wb\"))\n",
|
258
|
-
" \n",
|
259
|
-
" return dict_words\n"
|
260
|
-
]
|
261
|
-
},
|
262
|
-
{
|
263
|
-
"cell_type": "code",
|
264
|
-
"execution_count": 19,
|
265
|
-
"id": "29ca9d76",
|
266
|
-
"metadata": {},
|
267
|
-
"outputs": [],
|
268
|
-
"source": [
|
269
|
-
"def sentences_to_indexes(sentences, indexed_dictionary):\n",
|
270
|
-
" indexed_sentences = []\n",
|
271
|
-
" not_found_counter = 0\n",
|
272
|
-
" \n",
|
273
|
-
" for sent in sentences:\n",
|
274
|
-
" idx_sent = []\n",
|
275
|
-
" for word in sent:\n",
|
276
|
-
" try:\n",
|
277
|
-
" idx_sent.append(indexed_dictionary[word])\n",
|
278
|
-
" except KeyError:\n",
|
279
|
-
" idx_sent.append(data_utils.UNK_ID)\n",
|
280
|
-
" not_found_counter += 1\n",
|
281
|
-
" indexed_sentences.append(idx_sent)\n",
|
282
|
-
" \n",
|
283
|
-
" print('[sentences_to_indexes] Did not find {} words'.format(not_found_counter))\n",
|
284
|
-
" return indexed_sentences\n"
|
285
|
-
]
|
286
|
-
},
|
287
|
-
{
|
288
|
-
"cell_type": "code",
|
289
|
-
"execution_count": 21,
|
290
|
-
"id": "3e4a7aa8",
|
291
|
-
"metadata": {},
|
292
|
-
"outputs": [
|
293
|
-
{
|
294
|
-
"name": "stdout",
|
295
|
-
"output_type": "stream",
|
296
|
-
"text": [
|
297
|
-
"[sentences_to_indexes] Did not find 0 words\n",
|
298
|
-
"[sentences_to_indexes] Did not find 0 words\n",
|
299
|
-
"# Same sentences as before, with their dictionary ID\n",
|
300
|
-
"DE: [('sentence', 4), ('one', 8), ('for', 5), ('language', 6), ('1', 7)]\n"
|
301
|
-
]
|
302
|
-
}
|
303
|
-
],
|
304
|
-
"source": [
|
305
|
-
"# Example of defining filt_clean_sen_l1 and filt_clean_sen_l2 with actual data\n",
|
306
|
-
"filt_clean_sen_l1 = [\n",
|
307
|
-
" [\"sentence\", \"one\", \"for\", \"language\", \"1\"],\n",
|
308
|
-
" [\"another\", \"sentence\", \"for\", \"language\", \"1\"],\n",
|
309
|
-
" # Add more sentences as needed\n",
|
310
|
-
"]\n",
|
311
|
-
"\n",
|
312
|
-
"filt_clean_sen_l2 = [\n",
|
313
|
-
" [\"sentence\", \"one\", \"for\", \"language\", \"2\"],\n",
|
314
|
-
" [\"another\", \"sentence\", \"for\", \"language\", \"2\"],\n",
|
315
|
-
" # Add more sentences as needed\n",
|
316
|
-
"]\n",
|
317
|
-
"\n",
|
318
|
-
"# Rest of your code remains the same\n",
|
319
|
-
"dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=15000, storage_path=\"/tmp/l1_dict.p\")\n",
|
320
|
-
"dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=10000, storage_path=\"/tmp/l2_dict.p\")\n",
|
321
|
-
"idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)\n",
|
322
|
-
"idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)\n",
|
323
|
-
"\n",
|
324
|
-
"print(\"# Same sentences as before, with their dictionary ID\")\n",
|
325
|
-
"print(\"DE:\", list(zip(filt_clean_sen_l1[0], idx_sentences_l1[0])))\n"
|
326
|
-
]
|
327
|
-
},
|
328
|
-
{
|
329
|
-
"cell_type": "code",
|
330
|
-
"execution_count": 22,
|
331
|
-
"id": "64fa3be1",
|
332
|
-
"metadata": {},
|
333
|
-
"outputs": [],
|
334
|
-
"source": [
|
335
|
-
"# Same sentences as before, with their dictionary ID\n",
|
336
|
-
"DE: [('wiederaufnahme', 1616), ('der', 7), ('sitzungsperiode', 618)]\n",
|
337
|
-
"EN: [('resumption', 1779), ('of', 8), ('the', 5), ('session', 549)]"
|
338
|
-
]
|
339
|
-
},
|
340
|
-
{
|
341
|
-
"cell_type": "code",
|
342
|
-
"execution_count": 23,
|
343
|
-
"id": "32cf9dad",
|
344
|
-
"metadata": {},
|
345
|
-
"outputs": [],
|
346
|
-
"source": [
|
347
|
-
"def extract_max_length(corpora):\n",
|
348
|
-
" return max([len(sentence) for sentence in corpora])"
|
349
|
-
]
|
350
|
-
},
|
351
|
-
{
|
352
|
-
"cell_type": "code",
|
353
|
-
"execution_count": 24,
|
354
|
-
"id": "256f5ec9",
|
355
|
-
"metadata": {},
|
356
|
-
"outputs": [
|
357
|
-
{
|
358
|
-
"name": "stdout",
|
359
|
-
"output_type": "stream",
|
360
|
-
"text": [
|
361
|
-
"# Max sentence sizes:\n",
|
362
|
-
"DE: 5\n",
|
363
|
-
"EN: 5\n"
|
364
|
-
]
|
365
|
-
}
|
366
|
-
],
|
367
|
-
"source": [
|
368
|
-
"max_length_l1 = extract_max_length(idx_sentences_l1)\n",
|
369
|
-
"max_length_l2 = extract_max_length(idx_sentences_l2)\n",
|
370
|
-
"print(\"# Max sentence sizes:\")\n",
|
371
|
-
"print(\"DE:\", max_length_l1)\n",
|
372
|
-
"print(\"EN:\", max_length_l2)"
|
373
|
-
]
|
374
|
-
},
|
375
|
-
{
|
376
|
-
"cell_type": "code",
|
377
|
-
"execution_count": 25,
|
378
|
-
"id": "e5f5429e",
|
379
|
-
"metadata": {},
|
380
|
-
"outputs": [],
|
381
|
-
"source": [
|
382
|
-
"def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):\n",
|
383
|
-
" assert len(sentences_l1) == len(sentences_l2)\n",
|
384
|
-
" data_set = []\n",
|
385
|
-
" for i in range(len(sentences_l1)):\n",
|
386
|
-
" padding_l1 = len_l1 - len(sentences_l1[i])\n",
|
387
|
-
" pad_sentence_l1 = ([data_utils.PAD_ID]*padding_l1) + sentences_l1[i]\n",
|
388
|
-
" padding_l2 = len_l2 - len(sentences_l2[i])\n",
|
389
|
-
" pad_sentence_l2 = [data_utils.GO_ID] + sentences_l2[i] + [data_utils.EOS_ID] + ([data_utils.PAD_ID] * padding_l2)\n",
|
390
|
-
" data_set.append([pad_sentence_l1, pad_sentence_l2])\n",
|
391
|
-
" return data_set"
|
392
|
-
]
|
393
|
-
},
|
394
|
-
{
|
395
|
-
"cell_type": "code",
|
396
|
-
"execution_count": 26,
|
397
|
-
"id": "6ff5117d",
|
398
|
-
"metadata": {},
|
399
|
-
"outputs": [
|
400
|
-
{
|
401
|
-
"name": "stdout",
|
402
|
-
"output_type": "stream",
|
403
|
-
"text": [
|
404
|
-
"# Prepared minibatch with paddings and extra stuff\n",
|
405
|
-
"DE: [4, 8, 5, 6, 7]\n",
|
406
|
-
"EN: [1, 4, 8, 5, 6, 7, 2]\n",
|
407
|
-
"# The sentence pass from X to Y tokens\n",
|
408
|
-
"DE: 5 -> 5\n",
|
409
|
-
"EN: 5 -> 7\n"
|
410
|
-
]
|
411
|
-
}
|
412
|
-
],
|
413
|
-
"source": [
|
414
|
-
"data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)\n",
|
415
|
-
"print(\"# Prepared minibatch with paddings and extra stuff\")\n",
|
416
|
-
"print(\"DE:\", data_set[0][0])\n",
|
417
|
-
"print(\"EN:\", data_set[0][1])\n",
|
418
|
-
"print(\"# The sentence pass from X to Y tokens\")\n",
|
419
|
-
"print(\"DE:\", len(idx_sentences_l1[0]), \"->\", len(data_set[0][0]))\n",
|
420
|
-
"print(\"EN:\", len(idx_sentences_l2[0]), \"->\", len(data_set[0][1]))"
|
421
|
-
]
|
422
|
-
}
|
423
|
-
],
|
424
|
-
"metadata": {
|
425
|
-
"kernelspec": {
|
426
|
-
"display_name": "Python 3 (ipykernel)",
|
427
|
-
"language": "python",
|
428
|
-
"name": "python3"
|
429
|
-
},
|
430
|
-
"language_info": {
|
431
|
-
"codemirror_mode": {
|
432
|
-
"name": "ipython",
|
433
|
-
"version": 3
|
434
|
-
},
|
435
|
-
"file_extension": ".py",
|
436
|
-
"mimetype": "text/x-python",
|
437
|
-
"name": "python",
|
438
|
-
"nbconvert_exporter": "python",
|
439
|
-
"pygments_lexer": "ipython3",
|
440
|
-
"version": "3.10.12"
|
441
|
-
}
|
442
|
-
},
|
443
|
-
"nbformat": 4,
|
444
|
-
"nbformat_minor": 5
|
445
|
-
}
|
@@ -1,105 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "code",
|
5
|
-
"execution_count": 3,
|
6
|
-
"id": "9169418e-d21e-44c3-9765-d0406635ac5c",
|
7
|
-
"metadata": {
|
8
|
-
"tags": []
|
9
|
-
},
|
10
|
-
"outputs": [
|
11
|
-
{
|
12
|
-
"name": "stdout",
|
13
|
-
"output_type": "stream",
|
14
|
-
"text": [
|
15
|
-
"Most Probable Parse Tree: ('S', ('NP', ('Det', 'the'), ('N', 'cat')), ('NP', ('Det', 'chased'), ('N', 'bat')))\n",
|
16
|
-
"Parse Probability: 1.0\n"
|
17
|
-
]
|
18
|
-
}
|
19
|
-
],
|
20
|
-
"source": [
|
21
|
-
"from collections import defaultdict\n",
|
22
|
-
"\n",
|
23
|
-
"def viterbi_pcfg(words, pcfg_rules):\n",
|
24
|
-
" n = len(words)\n",
|
25
|
-
" table = [[defaultdict(lambda: (0.0, None)) for _ in range(n)] for _ in range(n)]\n",
|
26
|
-
"\n",
|
27
|
-
" # Initialization\n",
|
28
|
-
" for i, word in enumerate(words):\n",
|
29
|
-
" for nt, (prob, terminals) in pcfg_rules.items():\n",
|
30
|
-
" if word in terminals:\n",
|
31
|
-
" table[i][i][nt] = (prob, None)\n",
|
32
|
-
"\n",
|
33
|
-
" # Viterbi Algorithm\n",
|
34
|
-
" for length in range(2, n + 1):\n",
|
35
|
-
" for i in range(n - length + 1):\n",
|
36
|
-
" j = i + length - 1\n",
|
37
|
-
" for k in range(i, j):\n",
|
38
|
-
" for A, (prob_A, _) in pcfg_rules.items():\n",
|
39
|
-
" for B, (prob_B, _) in pcfg_rules.items():\n",
|
40
|
-
" for C in table[i][k]:\n",
|
41
|
-
" for D in table[k + 1][j]:\n",
|
42
|
-
" prob = prob_A * prob_B * pcfg_rules[A][1].count(C) * pcfg_rules[B][1].count(D)\n",
|
43
|
-
" if prob > table[i][j][A][0]:\n",
|
44
|
-
" table[i][j][A] = (prob, (C, D, k))\n",
|
45
|
-
"\n",
|
46
|
-
" # Reconstruct the most probable parse tree\n",
|
47
|
-
" def reconstruct_tree(i, j, nt):\n",
|
48
|
-
" if table[i][j][nt][1] is None:\n",
|
49
|
-
" return (nt, words[i])\n",
|
50
|
-
" else:\n",
|
51
|
-
" C, D, k = table[i][j][nt][1]\n",
|
52
|
-
" left_subtree = reconstruct_tree(i, k, C)\n",
|
53
|
-
" right_subtree = reconstruct_tree(k + 1, j, D)\n",
|
54
|
-
" return (nt, left_subtree, right_subtree)\n",
|
55
|
-
"\n",
|
56
|
-
" # Get the most probable parse tree and its probability\n",
|
57
|
-
" parse_tree = reconstruct_tree(0, n - 1, 'S')\n",
|
58
|
-
" parse_probability = table[0][-1]['S'][0]\n",
|
59
|
-
"\n",
|
60
|
-
" return parse_tree, parse_probability\n",
|
61
|
-
"\n",
|
62
|
-
"# Different PCFG rules\n",
|
63
|
-
"pcfg_rules = {\n",
|
64
|
-
" 'S': (1.0, ['NP', 'VP']),\n",
|
65
|
-
" 'NP': (0.6, ['Det', 'N']),\n",
|
66
|
-
" 'VP': (0.7, ['V', 'NP']),\n",
|
67
|
-
" 'Det': (1.0, ['the', 'a']),\n",
|
68
|
-
" 'N': (0.5, ['cat', 'dog', 'bat']),\n",
|
69
|
-
" 'V': (0.8, ['chased', 'caught'])\n",
|
70
|
-
"}\n",
|
71
|
-
"\n",
|
72
|
-
"# Different input sentence\n",
|
73
|
-
"words = ['the', 'cat', 'chased', 'a', 'bat']\n",
|
74
|
-
"\n",
|
75
|
-
"# Call Viterbi PCFG algorithm to get the most probable parse tree and its probability\n",
|
76
|
-
"parse_tree, parse_probability = viterbi_pcfg(words, pcfg_rules)\n",
|
77
|
-
"\n",
|
78
|
-
"# Print the most probable parse tree and its probability\n",
|
79
|
-
"print(f'Most Probable Parse Tree: {parse_tree}')\n",
|
80
|
-
"print(f'Parse Probability: {parse_probability}')\n"
|
81
|
-
]
|
82
|
-
}
|
83
|
-
],
|
84
|
-
"metadata": {
|
85
|
-
"kernelspec": {
|
86
|
-
"display_name": "Python 3 (ipykernel)",
|
87
|
-
"language": "python",
|
88
|
-
"name": "python3"
|
89
|
-
},
|
90
|
-
"language_info": {
|
91
|
-
"codemirror_mode": {
|
92
|
-
"name": "ipython",
|
93
|
-
"version": 3
|
94
|
-
},
|
95
|
-
"file_extension": ".py",
|
96
|
-
"mimetype": "text/x-python",
|
97
|
-
"name": "python",
|
98
|
-
"nbconvert_exporter": "python",
|
99
|
-
"pygments_lexer": "ipython3",
|
100
|
-
"version": "3.10.12"
|
101
|
-
}
|
102
|
-
},
|
103
|
-
"nbformat": 4,
|
104
|
-
"nbformat_minor": 5
|
105
|
-
}
|
@@ -1,87 +0,0 @@
|
|
1
|
-
import pickle
|
2
|
-
import re
|
3
|
-
from collections import Counter
|
4
|
-
from nltk.corpus import comtrans
|
5
|
-
|
6
|
-
def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    """Load a parallel corpus from NLTK's ``comtrans`` reader.

    Args:
        translated_sentences_l1_l2: File id handed to
            ``comtrans.aligned_sents``.

    Returns:
        A pair ``(sentences_l1, sentences_l2)``: the ``words`` attribute of
        every aligned sentence and, in the same order, its ``mots``
        attribute.
        NOTE(review): the calling script labels these DE and EN
        respectively -- confirm against the corpus file in use.
    """
    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))
    sentences_l1, sentences_l2 = [], []
    # Walk the aligned sentences once, splitting each pair into its two sides.
    for aligned in comtrans.aligned_sents(translated_sentences_l1_l2):
        sentences_l1.append(aligned.words)
        sentences_l2.append(aligned.mots)
    return sentences_l1, sentences_l2
|
12
|
-
|
13
|
-
# Load both sides of the default parallel corpus.
sen_l1, sen_l2 = retrieve_corpora()

# Show the first sentence pair as a quick sanity check.
print("# A sentence in the two languages DE & EN")
print("DE:", sen_l1[0])
print("EN:", sen_l2[0])

# Report the corpus size; both sides must hold the same number of sentences.
print("# Corpora length (i.e. number of sentences)")
print(len(sen_l1))
assert len(sen_l1) == len(sen_l2)
|
20
|
-
|
21
|
-
def clean_sentence(sentence):
    """Lower-case and re-tokenize a sentence given as a list of words.

    Each word is lower-cased and split on the delimiter characters listed in
    the pattern (punctuation, quotes, parentheses, ``$`` and the space).
    Because the pattern is a capturing group, the delimiters themselves are
    kept as separate tokens; only empty pieces are discarded.

    Args:
        sentence: Iterable of word strings.

    Returns:
        A flat list of non-empty lower-cased tokens, in original order.
    """
    splitter = re.compile(r"([!?.,:;$'\")( ])")
    tokens = []
    for word in sentence:
        # split() yields '' around adjacent/leading/trailing delimiters.
        tokens.extend(piece for piece in splitter.split(word.lower()) if piece)
    return tokens
|
25
|
-
|
26
|
-
# Tokenize/normalize every sentence on both sides of the corpus.
clean_sen_l1 = list(map(clean_sentence, sen_l1))
clean_sen_l2 = list(map(clean_sentence, sen_l2))

# Show the first sentence pair again, now after cleaning.
print("# Same sentence as before, but chunked and cleaned")
print("DE:", clean_sen_l1[0])
print("EN:", clean_sen_l2[0])
|
31
|
-
|
32
|
-
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    """Keep only the sentence pairs whose two sides both fit a length window.

    Args:
        sentences_l1: Indexable sequence of sentences (first side).
        sentences_l2: Indexable sequence of sentences (second side), looked
            up by the same position as ``sentences_l1``.
        min_len: Smallest accepted sentence length (inclusive).
        max_len: Largest accepted sentence length (inclusive).

    Returns:
        A pair of lists ``(kept_l1, kept_l2)`` holding the surviving
        sentences, still aligned position by position.
    """
    kept_l1, kept_l2 = [], []
    for position, sent_l1 in enumerate(sentences_l1):
        if not min_len <= len(sent_l1) <= max_len:
            continue
        # The second side is only consulted once the first side qualifies.
        sent_l2 = sentences_l2[position]
        if not min_len <= len(sent_l2) <= max_len:
            continue
        kept_l1.append(sent_l1)
        kept_l2.append(sent_l2)
    return kept_l1, kept_l2
|
40
|
-
|
41
|
-
# Drop sentence pairs that fall outside the default length window.
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(
    clean_sen_l1, clean_sen_l2)

# Report how many pairs survived; both sides must still be aligned.
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(filt_clean_sen_l1))
assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)
|
46
|
-
|
47
|
-
import data_utils
|
48
|
-
|
49
|
-
def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    """Build a token -> integer id vocabulary from tokenized sentences.

    The four reserved symbols from ``data_utils`` (``_PAD``, ``_GO``,
    ``_EOS``, ``_UNK``) are mapped to their fixed ids first. The
    ``dict_size`` most frequent tokens then receive consecutive ids starting
    at ``len(data_utils.OP_DICT_IDS)``, most frequent first.

    Args:
        sentences: Iterable of sentences, each an iterable of hashable
            tokens.
        dict_size: Maximum number of corpus tokens to include (reserved
            symbols are not counted against this limit).
        storage_path: Optional path; when truthy, the resulting dictionary
            is pickled to this file.

    Returns:
        The ``dict`` mapping each token to its integer id.
    """
    # presumably OP_DICT_IDS lists the reserved ids, so corpus tokens start
    # right after them -- verify against data_utils.
    opt_dict_size = len(data_utils.OP_DICT_IDS)

    # Frequency of every token across the whole corpus.
    count_words = Counter(word for sen in sentences for word in sen)

    dict_words = {}
    dict_words[data_utils._PAD] = data_utils.PAD_ID
    dict_words[data_utils._GO] = data_utils.GO_ID
    dict_words[data_utils._EOS] = data_utils.EOS_ID
    dict_words[data_utils._UNK] = data_utils.UNK_ID

    for idx, (word, _count) in enumerate(count_words.most_common(dict_size)):
        dict_words[word] = idx + opt_dict_size

    if storage_path:
        # Use a context manager so the file is flushed and closed even if
        # pickling fails; the original left the handle open.
        with open(storage_path, "wb") as storage_file:
            pickle.dump(dict_words, storage_file)

    return dict_words
|
70
|
-
|
71
|
-
def sentences_to_indexes(sentences, indexed_dictionary):
    """Translate tokenized sentences into lists of integer ids.

    Args:
        sentences: Iterable of sentences, each an iterable of tokens.
        indexed_dictionary: Mapping from token to integer id.

    Returns:
        A list with one id list per input sentence. Tokens whose lookup
        raises ``KeyError`` are replaced by ``data_utils.UNK_ID``; the total
        number of such misses is printed before returning.
    """
    missing = 0
    encoded = []

    for sent in sentences:
        row = []
        for token in sent:
            try:
                token_id = indexed_dictionary[token]
            except KeyError:
                # Out-of-vocabulary token: fall back to the unknown-word id.
                token_id = data_utils.UNK_ID
                missing += 1
            row.append(token_id)
        encoded.append(row)

    print('[sentences_to_indexes] Did not find {} words'.format(missing))
    return encoded
|
87
|
-
|