noshot 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- noshot/data/ML TS XAI/ML/1. PCA - EDA/PCA-EDA.ipynb +207 -0
- noshot/data/ML TS XAI/ML/1. PCA - EDA/balance-scale.csv +626 -0
- noshot/data/ML TS XAI/ML/1. PCA - EDA/input.txt +625 -0
- noshot/data/ML TS XAI/ML/2. KNN Classifier/KNN.ipynb +287 -0
- noshot/data/ML TS XAI/ML/2. KNN Classifier/balance-scale.csv +626 -0
- noshot/data/ML TS XAI/ML/2. KNN Classifier/input.txt +625 -0
- noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/LDA.ipynb +83 -0
- noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/balance-scale.csv +626 -0
- noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/input.txt +625 -0
- noshot/data/ML TS XAI/ML/4. Linear Regression/Linear-Regression.ipynb +117 -0
- noshot/data/ML TS XAI/ML/4. Linear Regression/machine-data.csv +210 -0
- noshot/data/ML TS XAI/ML/5. Logistic Regression/Logistic-Regression.ipynb +137 -0
- noshot/data/ML TS XAI/ML/5. Logistic Regression/wine-dataset.csv +179 -0
- noshot/data/ML TS XAI/ML/6. Bayesian Classifier/Bayesian.ipynb +129 -0
- noshot/data/ML TS XAI/ML/6. Bayesian Classifier/wine-dataset.csv +179 -0
- noshot/data/ML TS XAI/TS/1. EDA - Handling Time Series Data/Handling TS Data.ipynb +784 -0
- noshot/data/ML TS XAI/TS/1. EDA - Handling Time Series Data/raw_sales.csv +29581 -0
- noshot/data/ML TS XAI/TS/2. Feature Engineering/Feature Engineering-.ipynb +1445 -0
- noshot/data/ML TS XAI/TS/3. Temporal Relationships/Exploring Temporal Relationships.ipynb +603 -0
- noshot/data/ML TS XAI/TS/4. Up-Down-Sampling and Interploation/Up-Down-Sampling.ipynb +721 -0
- noshot/data/ML TS XAI/TS/4. Up-Down-Sampling and Interploation/shampoo_sales.csv +37 -0
- noshot/data/ML TS XAI/TS/5. Stationarity - Trend - Seasonality/Stationarity-Trend-Seasonality.ipynb +392 -0
- noshot/data/ML TS XAI/TS/5. Stationarity - Trend - Seasonality/daily-min-temperatures.csv +3651 -0
- noshot/data/ML TS XAI/TS/5. Stationarity - Trend - Seasonality/daily-total-female-births.csv +366 -0
- noshot/data/ML TS XAI/TS/6. Autocorrelation - Partial Autocorrelation/ACF-PACF.ipynb +175 -0
- noshot/data/ML TS XAI/TS/6. Autocorrelation - Partial Autocorrelation/daily-min-temperatures.csv +3651 -0
- {noshot-0.1.7.dist-info → noshot-0.1.9.dist-info}/METADATA +2 -2
- noshot-0.1.9.dist-info/RECORD +35 -0
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(A) Breadth First Search.ipynb +0 -112
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(B) Depth First Search.ipynb +0 -111
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(C) Uniform Cost Search.ipynb +0 -134
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(D) Depth Limites Search.ipynb +0 -115
- noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(E) Iterative Deepening DFS.ipynb +0 -123
- noshot/data/ML TS XAI/AIDS/10. ANOVA/2_ANOVA.csv +0 -769
- noshot/data/ML TS XAI/AIDS/10. ANOVA/One Way ANOVA (Repeated Measure).ipynb +0 -126
- noshot/data/ML TS XAI/AIDS/10. ANOVA/One Way ANOVA.ipynb +0 -134
- noshot/data/ML TS XAI/AIDS/10. ANOVA/Sample 1 Way ANOVA Test.ipynb +0 -119
- noshot/data/ML TS XAI/AIDS/10. ANOVA/Two Way ANOVA.ipynb +0 -138
- noshot/data/ML TS XAI/AIDS/10. ANOVA/reaction_time.csv +0 -5
- noshot/data/ML TS XAI/AIDS/10. ANOVA/sample_data.csv +0 -16
- noshot/data/ML TS XAI/AIDS/10. ANOVA/sleep_deprivation.csv +0 -4
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/3_Linear.csv +0 -4802
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression LAB.ipynb +0 -113
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression New- sklearn.ipynb +0 -118
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression.ipynb +0 -148
- noshot/data/ML TS XAI/AIDS/11. Linear Regression/house_rate.csv +0 -22
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/Logistic Regression New- sklearn.ipynb +0 -128
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/Logistic Regression.ipynb +0 -145
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/default.csv +0 -1001
- noshot/data/ML TS XAI/AIDS/12. Logistic Regression/hours_scores_records.csv +0 -101
- noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(A) Astar.ipynb +0 -256
- noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(B) IDAstar.ipynb +0 -157
- noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(C) SMAstar.ipynb +0 -178
- noshot/data/ML TS XAI/AIDS/3. Genetic Algorithm/Genetic.ipynb +0 -95
- noshot/data/ML TS XAI/AIDS/4. Simulated Annealing/Simulated Annealing.ipynb +0 -74
- noshot/data/ML TS XAI/AIDS/4. Simulated Annealing/Sudoku Simulated Annealing.ipynb +0 -103
- noshot/data/ML TS XAI/AIDS/5. Alpha Beta Pruning/AlphaBetaPruning.ipynb +0 -182
- noshot/data/ML TS XAI/AIDS/6. Consraint Satisfaction Problems (CSP)/(A) CSP House Allocation.ipynb +0 -120
- noshot/data/ML TS XAI/AIDS/6. Consraint Satisfaction Problems (CSP)/(B) CSP Map Coloring.ipynb +0 -125
- noshot/data/ML TS XAI/AIDS/7. Random Sampling/Random Sampling.ipynb +0 -73
- noshot/data/ML TS XAI/AIDS/7. Random Sampling/height_weight_bmi.csv +0 -8389
- noshot/data/ML TS XAI/AIDS/8. Z Test/Z Test Hash Function.ipynb +0 -141
- noshot/data/ML TS XAI/AIDS/8. Z Test/Z Test.ipynb +0 -151
- noshot/data/ML TS XAI/AIDS/8. Z Test/height_weight_bmi.csv +0 -8389
- noshot/data/ML TS XAI/AIDS/9. T Test/1_heart.csv +0 -304
- noshot/data/ML TS XAI/AIDS/9. T Test/Independent T Test.ipynb +0 -119
- noshot/data/ML TS XAI/AIDS/9. T Test/Paired T Test.ipynb +0 -118
- noshot/data/ML TS XAI/AIDS/9. T Test/T Test Hash Function.ipynb +0 -142
- noshot/data/ML TS XAI/AIDS/9. T Test/T Test.ipynb +0 -158
- noshot/data/ML TS XAI/AIDS/9. T Test/height_weight_bmi.csv +0 -8389
- noshot/data/ML TS XAI/AIDS/9. T Test/iq_test.csv +0 -0
- noshot/data/ML TS XAI/AIDS/Others (AllinOne)/All In One.ipynb +0 -4581
- noshot/data/ML TS XAI/CN/1. Chat Application/chat.java +0 -81
- noshot/data/ML TS XAI/CN/1. Chat Application/output.png +0 -0
- noshot/data/ML TS XAI/CN/1. Chat Application/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/LAN.tcl +0 -65
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/analysis.awk +0 -44
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/output.png +0 -0
- noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/complexdcf.tcl +0 -229
- noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/output.png +0 -0
- noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/2. File Transfer/file_to_send.txt +0 -2
- noshot/data/ML TS XAI/CN/2. File Transfer/filetransfer.java +0 -119
- noshot/data/ML TS XAI/CN/2. File Transfer/output.png +0 -0
- noshot/data/ML TS XAI/CN/2. File Transfer/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/Client.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/MyServerImpl.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/MyServerIntf.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/Server.class +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/output.png +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/rmi.java +0 -56
- noshot/data/ML TS XAI/CN/4. Wired Network/output.png +0 -0
- noshot/data/ML TS XAI/CN/4. Wired Network/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/4. Wired Network/wired.awk +0 -25
- noshot/data/ML TS XAI/CN/4. Wired Network/wired.tcl +0 -81
- noshot/data/ML TS XAI/CN/5. Wireless Network/output.png +0 -0
- noshot/data/ML TS XAI/CN/5. Wireless Network/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/5. Wireless Network/wireless.awk +0 -27
- noshot/data/ML TS XAI/CN/5. Wireless Network/wireless.tcl +0 -153
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/analysis.awk +0 -27
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/output.png +0 -0
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/sack.tcl +0 -86
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/vegas.tcl +0 -86
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/analysis.awk +0 -28
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/output.png +0 -0
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/reno.tcl +0 -78
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/tahoe.tcl +0 -79
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/analysis.awk +0 -27
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/flow.tcl +0 -163
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/output.png +0 -0
- noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/DV.tcl +0 -111
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/LS.tcl +0 -106
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/analysis.awk +0 -36
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/output.png +0 -0
- noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/analysis.awk +0 -20
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/broadcast.tcl +0 -76
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/multicast.tcl +0 -103
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/output.png +0 -0
- noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/procedure.png +0 -0
- noshot/data/ML TS XAI/CN/9. DHCP/DHCP.java +0 -125
- noshot/data/ML TS XAI/CN/9. DHCP/output.png +0 -0
- noshot/data/ML TS XAI/CN/9. DHCP/procedure.png +0 -0
- noshot/data/ML TS XAI/NLP/NLP 1/1-Prereqs.py +0 -18
- noshot/data/ML TS XAI/NLP/NLP 1/2-Chi2test.py +0 -83
- noshot/data/ML TS XAI/NLP/NLP 1/2-T-test.py +0 -79
- noshot/data/ML TS XAI/NLP/NLP 1/3-WSD-nb.py +0 -53
- noshot/data/ML TS XAI/NLP/NLP 1/4-Hindle-Rooth.py +0 -53
- noshot/data/ML TS XAI/NLP/NLP 1/5-HMM-Trellis.py +0 -82
- noshot/data/ML TS XAI/NLP/NLP 1/6-HMM-Viterbi.py +0 -16
- noshot/data/ML TS XAI/NLP/NLP 1/7-PCFG-parsetree.py +0 -15
- noshot/data/ML TS XAI/NLP/NLP 1/Chi2test.ipynb +0 -285
- noshot/data/ML TS XAI/NLP/NLP 1/Hindle-Rooth.ipynb +0 -179
- noshot/data/ML TS XAI/NLP/NLP 1/Lab 10 - Text generator using LSTM.ipynb +0 -1461
- noshot/data/ML TS XAI/NLP/NLP 1/Lab 11 NMT.ipynb +0 -2307
- noshot/data/ML TS XAI/NLP/NLP 1/PCFG.ipynb +0 -134
- noshot/data/ML TS XAI/NLP/NLP 1/Prereqs.ipynb +0 -131
- noshot/data/ML TS XAI/NLP/NLP 1/T test.ipynb +0 -252
- noshot/data/ML TS XAI/NLP/NLP 1/TFIDF BOW.ipynb +0 -171
- noshot/data/ML TS XAI/NLP/NLP 1/Trellis.ipynb +0 -244
- noshot/data/ML TS XAI/NLP/NLP 1/WSD.ipynb +0 -645
- noshot/data/ML TS XAI/NLP/NLP 1/Word2Vec.ipynb +0 -93
- noshot/data/ML TS XAI/NLP/NLP 2/Lab01(tokenizer)/tokenizer.ipynb +0 -370
- noshot/data/ML TS XAI/NLP/NLP 2/Lab01(tokenizer)/training_tokenizer.txt +0 -6
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/exp0.ipynb +0 -274
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/lab2.ipynb +0 -905
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/test.txt +0 -1
- noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/tokenizing.ipynb +0 -272
- noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/collocation.ipynb +0 -332
- noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/lab3.ipynb +0 -549
- noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/nlp.txt +0 -1
- noshot/data/ML TS XAI/NLP/NLP 2/Lab04(collocation)/Lab4-NLP-Exp-2.ipynb +0 -817
- noshot/data/ML TS XAI/NLP/NLP 2/Lab04(collocation)/collocation.ipynb +0 -332
- noshot/data/ML TS XAI/NLP/NLP 2/Lab05(WSD)/NLP-Lab-5-Exp3.ipynb +0 -231
- noshot/data/ML TS XAI/NLP/NLP 2/Lab05(WSD)/word-sense-disambiguation.ipynb +0 -507
- noshot/data/ML TS XAI/NLP/NLP 2/Lab06(additional-exercise)/lab6.ipynb +0 -134
- noshot/data/ML TS XAI/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP Exp 4.ipynb +0 -255
- noshot/data/ML TS XAI/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP_Exp_5.ipynb +0 -159
- noshot/data/ML TS XAI/NLP/NLP 2/Lab08(PCFG)/PCFG.ipynb +0 -282
- noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/Lab 9 - MLP classifier.ipynb +0 -670
- noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/MLP-alternative-code.ipynb +0 -613
- noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/hindle-rooth-algorithm.ipynb +0 -74
- noshot/data/ML TS XAI/NLP/NLP 2/Lab10(LSTM)/Lab_10_Text_generator_using_LSTM.ipynb +0 -480
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Machine-translation.ipynb +0 -445
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Viterbi-PCFG.ipynb +0 -105
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/corpora_tools.py +0 -87
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/data_utils.py +0 -11
- noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/train_translator.py +0 -83
- noshot/data/ML TS XAI/NLP/NLP 2/Lab12(Information-Extraction)/Information_Extraction.ipynb +0 -201
- noshot/data/ML TS XAI/NLP/NLP 3/Backtrack-without-Verbitri.ipynb +0 -185
- noshot/data/ML TS XAI/NLP/NLP 3/Backward-Procedure.ipynb +0 -597
- noshot/data/ML TS XAI/NLP/NLP 3/Bag_of.ipynb +0 -1422
- noshot/data/ML TS XAI/NLP/NLP 3/CYK-algorithm.ipynb +0 -1067
- noshot/data/ML TS XAI/NLP/NLP 3/Forward-Procedure.ipynb +0 -477
- noshot/data/ML TS XAI/NLP/NLP 3/LSTM.ipynb +0 -1290
- noshot/data/ML TS XAI/NLP/NLP 3/Lab 10 - Text generator using LSTM.ipynb +0 -1461
- noshot/data/ML TS XAI/NLP/NLP 3/Lab 11 NMT.ipynb +0 -2307
- noshot/data/ML TS XAI/NLP/NLP 3/NLP-LAB-4.ipynb +0 -216
- noshot/data/ML TS XAI/NLP/NLP 3/NLP-LAB-5.ipynb +0 -216
- noshot/data/ML TS XAI/NLP/NLP 3/abc.txt +0 -6
- noshot/data/ML TS XAI/NLP/NLP 3/ex-1-nltk.ipynb +0 -711
- noshot/data/ML TS XAI/NLP/NLP 3/ex-2-nlp.ipynb +0 -267
- noshot/data/ML TS XAI/NLP/NLP 3/exp8&9.ipynb +0 -305
- noshot/data/ML TS XAI/NLP/NLP 3/hind.ipynb +0 -287
- noshot/data/ML TS XAI/NLP/NLP 3/lab66.ipynb +0 -752
- noshot/data/ML TS XAI/NLP/NLP 3/leb_3.ipynb +0 -612
- noshot/data/ML TS XAI/NLP/NLP 3/naive_bayes_classifier.pkl +0 -0
- noshot/data/ML TS XAI/NLP/NLP 3/nlp_leb_1.ipynb +0 -3008
- noshot/data/ML TS XAI/NLP/NLP 3/nlp_leb_2.ipynb +0 -3095
- noshot/data/ML TS XAI/NLP/NLP 3/nlplab-9.ipynb +0 -295
- noshot/data/ML TS XAI/NLP/NLP 3/nltk-ex-4.ipynb +0 -506
- noshot/data/ML TS XAI/NLP/NLP 3/text1.txt +0 -48
- noshot/data/ML TS XAI/NLP/NLP 3/text2.txt +0 -8
- noshot/data/ML TS XAI/NLP/NLP 3/text3.txt +0 -48
- noshot/data/ML TS XAI/NLP/NLP 3/translation-rnn.ipynb +0 -812
- noshot/data/ML TS XAI/NLP/NLP 3/word2vector.ipynb +0 -173
- noshot/data/ML TS XAI/NLP/NLP 4/Backward Procedure Algorithm.ipynb +0 -179
- noshot/data/ML TS XAI/NLP/NLP 4/Chi Square Collocation.ipynb +0 -208
- noshot/data/ML TS XAI/NLP/NLP 4/Collocation (T test).ipynb +0 -188
- noshot/data/ML TS XAI/NLP/NLP 4/Experiment 1.ipynb +0 -437
- noshot/data/ML TS XAI/NLP/NLP 4/Forward Procedure Algorithm.ipynb +0 -132
- noshot/data/ML TS XAI/NLP/NLP 4/Hindle Rooth.ipynb +0 -414
- noshot/data/ML TS XAI/NLP/NLP 4/MachineTranslation.ipynb +0 -368
- noshot/data/ML TS XAI/NLP/NLP 4/Multi Layer Perceptron using MLPClassifier.ipynb +0 -86
- noshot/data/ML TS XAI/NLP/NLP 4/Multi Layer Perceptron using Tensorflow.ipynb +0 -112
- noshot/data/ML TS XAI/NLP/NLP 4/PCFG Inside Probability.ipynb +0 -451
- noshot/data/ML TS XAI/NLP/NLP 4/Text Generation using LSTM.ipynb +0 -297
- noshot/data/ML TS XAI/NLP/NLP 4/Viterbi.ipynb +0 -310
- noshot/data/ML TS XAI/NLP/NLP 4/Word Sense Disambiguation.ipynb +0 -335
- noshot/data/ML TS XAI/NLP/NLP 5/10.Text Generation using LSTM.ipynb +0 -316
- noshot/data/ML TS XAI/NLP/NLP 5/11.Machine Translation.ipynb +0 -868
- noshot/data/ML TS XAI/NLP/NLP 5/2.T and Chi2 Test.ipynb +0 -204
- noshot/data/ML TS XAI/NLP/NLP 5/3.Word Sense Diambiguation.ipynb +0 -234
- noshot/data/ML TS XAI/NLP/NLP 5/4.Hinddle and Rooth.ipynb +0 -128
- noshot/data/ML TS XAI/NLP/NLP 5/5.Forward and Backward.ipynb +0 -149
- noshot/data/ML TS XAI/NLP/NLP 5/6.Viterbi.ipynb +0 -111
- noshot/data/ML TS XAI/NLP/NLP 5/7.PCFG Parse Tree.ipynb +0 -134
- noshot/data/ML TS XAI/NLP/NLP 5/7.PCFG using cyk.ipynb +0 -101
- noshot/data/ML TS XAI/NLP/NLP 5/8.Bag of words and TF-IDF.ipynb +0 -310
- noshot/data/ML TS XAI/NLP/NLP 5/9.Word2Vector.ipynb +0 -78
- noshot/data/ML TS XAI/NLP/NLP 5/NLP ALL In One.ipynb +0 -2619
- noshot/data/ML TS XAI/NLP/NLP 5/sample1.txt +0 -15
- noshot/data/ML TS XAI/NLP/NLP 5/sample2.txt +0 -4
- noshot/data/ML TS XAI/NLP/NLP 5/word2vec_model.bin +0 -0
- noshot/data/ML TS XAI/NLP/NLP 6/1. Tokenize, Tagging, NER, Parse Tree.ipynb +0 -312
- noshot/data/ML TS XAI/NLP/NLP 6/2. T Test and Chi2 Test.ipynb +0 -185
- noshot/data/ML TS XAI/NLP/NLP 6/3. Naive Bayes WSD.ipynb +0 -199
- noshot/data/ML TS XAI/NLP/NLP 6/4. Hinddle and Rooth.ipynb +0 -151
- noshot/data/ML TS XAI/NLP/NLP 6/5 and 6 FWD, BWD, Viterbi.ipynb +0 -164
- noshot/data/ML TS XAI/NLP/NLP 6/7. PCFG using CYK.ipynb +0 -383
- noshot/data/ML TS XAI/NLP/NLP 6/8. BOW and TF-IDF.ipynb +0 -252
- noshot/data/ML TS XAI/Ubuntu CN Lab.iso +0 -0
- noshot-0.1.7.dist-info/RECORD +0 -216
- {noshot-0.1.7.dist-info → noshot-0.1.9.dist-info}/LICENSE.txt +0 -0
- {noshot-0.1.7.dist-info → noshot-0.1.9.dist-info}/WHEEL +0 -0
- {noshot-0.1.7.dist-info → noshot-0.1.9.dist-info}/top_level.txt +0 -0
@@ -1,711 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "code",
|
5
|
-
"execution_count": 2,
|
6
|
-
"id": "62a71ce0-0a21-45ef-9204-16d27ff27b89",
|
7
|
-
"metadata": {},
|
8
|
-
"outputs": [
|
9
|
-
{
|
10
|
-
"name": "stdout",
|
11
|
-
"output_type": "stream",
|
12
|
-
"text": [
|
13
|
-
"Top 10 Collocations:\n",
|
14
|
-
"1. Bigram: ('accurate', 'actionable'), Frequency: 1\n",
|
15
|
-
"2. Bigram: ('accurately', 'model'), Frequency: 1\n",
|
16
|
-
"3. Bigram: ('action', 'supply'), Frequency: 1\n",
|
17
|
-
"4. Bigram: ('adjusting', 'parameters'), Frequency: 1\n",
|
18
|
-
"5. Bigram: ('advancements', 'technology'), Frequency: 1\n",
|
19
|
-
"6. Bigram: ('allow', 'users'), Frequency: 1\n",
|
20
|
-
"7. Bigram: ('anomalies', 'triggering'), Frequency: 1\n",
|
21
|
-
"8. Bigram: ('assess', 'credit'), Frequency: 1\n",
|
22
|
-
"9. Bigram: ('bandwidth', 'requirements'), Frequency: 1\n",
|
23
|
-
"10. Bigram: ('becomes', 'available'), Frequency: 1\n"
|
24
|
-
]
|
25
|
-
}
|
26
|
-
],
|
27
|
-
"source": [
|
28
|
-
"import nltk\n",
|
29
|
-
"from nltk.tokenize import word_tokenize\n",
|
30
|
-
"from nltk.corpus import stopwords\n",
|
31
|
-
"from nltk.probability import FreqDist\n",
|
32
|
-
"from nltk.collocations import BigramCollocationFinder\n",
|
33
|
-
"from nltk.metrics import BigramAssocMeasures\n",
|
34
|
-
"\n",
|
35
|
-
"# Download required NLTK data\n",
|
36
|
-
"#nltk.download('punkt')\n",
|
37
|
-
"#nltk.download('stopwords')\n",
|
38
|
-
"\n",
|
39
|
-
"# Load text\n",
|
40
|
-
"with open('text3.txt', 'r') as file:\n",
|
41
|
-
" text = file.read()\n",
|
42
|
-
"\n",
|
43
|
-
"# Preprocess text\n",
|
44
|
-
"words = [word.lower() for word in word_tokenize(text) if word.isalnum() and word.lower() not in stopwords.words('english')]\n",
|
45
|
-
"\n",
|
46
|
-
"# Calculate bigrams and their frequencies\n",
|
47
|
-
"bigram_finder = BigramCollocationFinder.from_words(words)\n",
|
48
|
-
"bigram_freq = bigram_finder.ngram_fd\n",
|
49
|
-
"\n",
|
50
|
-
"# Calculate top collocations\n",
|
51
|
-
"bigram_measures = BigramAssocMeasures()\n",
|
52
|
-
"collocations = bigram_finder.nbest(bigram_measures.pmi, 10)\n",
|
53
|
-
"\n",
|
54
|
-
"# Print top collocations\n",
|
55
|
-
"print(\"Top 10 Collocations:\")\n",
|
56
|
-
"for i, collocation in enumerate(collocations):\n",
|
57
|
-
" print(f\"{i+1}. Bigram: {collocation}, Frequency: {bigram_freq[collocation]}\")"
|
58
|
-
]
|
59
|
-
},
|
60
|
-
{
|
61
|
-
"cell_type": "code",
|
62
|
-
"execution_count": 23,
|
63
|
-
"id": "d3ef1337-2135-48cd-b1e8-8aa0ab375450",
|
64
|
-
"metadata": {},
|
65
|
-
"outputs": [
|
66
|
-
{
|
67
|
-
"name": "stdout",
|
68
|
-
"output_type": "stream",
|
69
|
-
"text": [
|
70
|
-
"Mean Bigram Frequency: 1.17\n",
|
71
|
-
"\n",
|
72
|
-
"Top 10 Collocations by Mean Probability:\n",
|
73
|
-
"1. Bigram: ('data', 'science'),\t Frequency: 15, \tMean Probability (μ-value): 0.020243\n",
|
74
|
-
"2. Bigram: ('data', 'processing'),\t Frequency: 7, \tMean Probability (μ-value): 0.009447\n",
|
75
|
-
"3. Bigram: ('predictive', 'analytics'),\t Frequency: 5, \tMean Probability (μ-value): 0.006748\n",
|
76
|
-
"4. Bigram: ('data', 'visualization'),\t Frequency: 5, \tMean Probability (μ-value): 0.006748\n",
|
77
|
-
"5. Bigram: ('ai', 'data'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
|
78
|
-
"6. Bigram: ('ai', 'algorithms'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
|
79
|
-
"7. Bigram: ('data', 'cleaning'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
|
80
|
-
"8. Bigram: ('cleaning', 'preparation'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
|
81
|
-
"9. Bigram: ('natural', 'language'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
|
82
|
-
"10. Bigram: ('language', 'processing'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n"
|
83
|
-
]
|
84
|
-
}
|
85
|
-
],
|
86
|
-
"source": [
|
87
|
-
"import nltk\n",
|
88
|
-
"from nltk.tokenize import word_tokenize\n",
|
89
|
-
"from nltk.corpus import stopwords\n",
|
90
|
-
"from nltk.probability import FreqDist\n",
|
91
|
-
"\n",
|
92
|
-
"def preprocess_text(text: str) -> list[str]:\n",
|
93
|
-
" \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
|
94
|
-
" words = word_tokenize(text)\n",
|
95
|
-
" stop_words = set(stopwords.words('english'))\n",
|
96
|
-
" return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
|
97
|
-
"\n",
|
98
|
-
"def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
|
99
|
-
" \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
|
100
|
-
" mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
|
101
|
-
" return mean_probabilities\n",
|
102
|
-
"\n",
|
103
|
-
"def main():\n",
|
104
|
-
" # Download required NLTK data\n",
|
105
|
-
" # nltk.download('punkt')\n",
|
106
|
-
" # nltk.download('stopwords')\n",
|
107
|
-
"\n",
|
108
|
-
" # Load text\n",
|
109
|
-
" with open('text3.txt', 'r') as file:\n",
|
110
|
-
" text = file.read()\n",
|
111
|
-
"\n",
|
112
|
-
" # Preprocess text\n",
|
113
|
-
" words = preprocess_text(text)\n",
|
114
|
-
"\n",
|
115
|
-
" # Calculate word frequency distribution\n",
|
116
|
-
" fdist = FreqDist(words)\n",
|
117
|
-
"\n",
|
118
|
-
" # Calculate bigrams and their frequencies\n",
|
119
|
-
" bigrams = list(nltk.bigrams(words))\n",
|
120
|
-
" bigram_freq = FreqDist(bigrams)\n",
|
121
|
-
"\n",
|
122
|
-
" # Calculate mean probability (μ-value) for each bigram\n",
|
123
|
-
" mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
|
124
|
-
"\n",
|
125
|
-
" # Sort collocations by mean probability\n",
|
126
|
-
" collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
|
127
|
-
"\n",
|
128
|
-
" # Calculate mean of bigram frequencies\n",
|
129
|
-
" total_bigram_freq = sum(bigram_freq.values())\n",
|
130
|
-
" mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
|
131
|
-
"\n",
|
132
|
-
" # Print mean bigram frequency\n",
|
133
|
-
" print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")\n",
|
134
|
-
"\n",
|
135
|
-
" # Print top N collocations with their frequencies and mean probabilities\n",
|
136
|
-
" N = 10\n",
|
137
|
-
" print(\"Top\", N, \"Collocations by Mean Probability:\")\n",
|
138
|
-
" for i, (bigram, mean_prob) in enumerate(collocations[:N]):\n",
|
139
|
-
" print(f\"{i+1}. Bigram: {bigram},\\t Frequency: {bigram_freq[bigram]}, \\tMean Probability (μ-value): {mean_prob:.6f}\")\n",
|
140
|
-
"\n",
|
141
|
-
"if __name__ == \"__main__\":\n",
|
142
|
-
" main()\n"
|
143
|
-
]
|
144
|
-
},
|
145
|
-
{
|
146
|
-
"cell_type": "code",
|
147
|
-
"execution_count": 1,
|
148
|
-
"id": "68a08d04-501c-425d-a497-c7fe1d9a0013",
|
149
|
-
"metadata": {},
|
150
|
-
"outputs": [
|
151
|
-
{
|
152
|
-
"name": "stdout",
|
153
|
-
"output_type": "stream",
|
154
|
-
"text": [
|
155
|
-
"| Rank | Bigram | Frequency | Mean Probability (μ-value) |\n",
|
156
|
-
"|--------+-----------------------------+-------------+------------------------------|\n",
|
157
|
-
"| 1 | ('data', 'science') | 15 | 0.020243 |\n",
|
158
|
-
"| 2 | ('data', 'processing') | 7 | 0.009447 |\n",
|
159
|
-
"| 3 | ('predictive', 'analytics') | 5 | 0.006748 |\n",
|
160
|
-
"| 4 | ('data', 'visualization') | 5 | 0.006748 |\n",
|
161
|
-
"| 5 | ('ai', 'data') | 4 | 0.005398 |\n",
|
162
|
-
"| 6 | ('ai', 'algorithms') | 4 | 0.005398 |\n",
|
163
|
-
"| 7 | ('data', 'cleaning') | 4 | 0.005398 |\n",
|
164
|
-
"| 8 | ('cleaning', 'preparation') | 4 | 0.005398 |\n",
|
165
|
-
"| 9 | ('natural', 'language') | 4 | 0.005398 |\n",
|
166
|
-
"| 10 | ('language', 'processing') | 4 | 0.005398 |\n"
|
167
|
-
]
|
168
|
-
}
|
169
|
-
],
|
170
|
-
"source": [
|
171
|
-
"import nltk\n",
|
172
|
-
"from nltk.tokenize import word_tokenize\n",
|
173
|
-
"from nltk.corpus import stopwords\n",
|
174
|
-
"from nltk.probability import FreqDist\n",
|
175
|
-
"from tabulate import tabulate\n",
|
176
|
-
"\n",
|
177
|
-
"def preprocess_text(text: str) -> list[str]:\n",
|
178
|
-
" \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
|
179
|
-
" words = word_tokenize(text)\n",
|
180
|
-
" stop_words = set(stopwords.words('english'))\n",
|
181
|
-
" return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
|
182
|
-
"\n",
|
183
|
-
"def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
|
184
|
-
" \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
|
185
|
-
" mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
|
186
|
-
" return mean_probabilities\n",
|
187
|
-
"\n",
|
188
|
-
"def main():\n",
|
189
|
-
" # Download required NLTK data\n",
|
190
|
-
" # nltk.download('punkt')\n",
|
191
|
-
" # nltk.download('stopwords')\n",
|
192
|
-
"\n",
|
193
|
-
" # Load text\n",
|
194
|
-
" with open('text3.txt', 'r') as file:\n",
|
195
|
-
" text = file.read()\n",
|
196
|
-
"\n",
|
197
|
-
" # Preprocess text\n",
|
198
|
-
" words = preprocess_text(text)\n",
|
199
|
-
"\n",
|
200
|
-
" # Calculate word frequency distribution\n",
|
201
|
-
" fdist = FreqDist(words)\n",
|
202
|
-
"\n",
|
203
|
-
" # Calculate bigrams and their frequencies\n",
|
204
|
-
" bigrams = list(nltk.bigrams(words))\n",
|
205
|
-
" bigram_freq = FreqDist(bigrams)\n",
|
206
|
-
"\n",
|
207
|
-
" # Calculate mean probability (μ-value) for each bigram\n",
|
208
|
-
" mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
|
209
|
-
"\n",
|
210
|
-
" # Sort collocations by mean probability\n",
|
211
|
-
" collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
|
212
|
-
"\n",
|
213
|
-
" '''# Calculate mean of bigram frequencies\n",
|
214
|
-
" total_bigram_freq = sum(bigram_freq.values())\n",
|
215
|
-
" mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
|
216
|
-
"\n",
|
217
|
-
" # Print mean bigram frequency\n",
|
218
|
-
" print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")'''\n",
|
219
|
-
"\n",
|
220
|
-
" # Print top N collocations with their frequencies and mean probabilities\n",
|
221
|
-
" N = 10\n",
|
222
|
-
" headers = [\"Rank\", \"Bigram\", \"Frequency\", \"Mean Probability (μ-value)\"]\n",
|
223
|
-
" table = []\n",
|
224
|
-
" for i, (bigram, mean_prob) in enumerate(collocations[:N]):\n",
|
225
|
-
" table.append([i+1, bigram, bigram_freq[bigram], f\"{mean_prob:.6f}\"])\n",
|
226
|
-
" print(tabulate(table, headers, tablefmt=\"orgtbl\"))\n",
|
227
|
-
"\n",
|
228
|
-
"if __name__ == \"__main__\":\n",
|
229
|
-
" main()"
|
230
|
-
]
|
231
|
-
},
|
232
|
-
{
|
233
|
-
"cell_type": "code",
|
234
|
-
"execution_count": 3,
|
235
|
-
"id": "bdcf71a0-02f3-49ba-bf3b-09bb1c079fdf",
|
236
|
-
"metadata": {},
|
237
|
-
"outputs": [
|
238
|
-
{
|
239
|
-
"name": "stdout",
|
240
|
-
"output_type": "stream",
|
241
|
-
"text": [
|
242
|
-
"Mean Bigram Frequency: 1.17\n",
|
243
|
-
"\n"
|
244
|
-
]
|
245
|
-
},
|
246
|
-
{
|
247
|
-
"name": "stderr",
|
248
|
-
"output_type": "stream",
|
249
|
-
"text": [
|
250
|
-
"C:\\Users\\admin\\anaconda3\\Lib\\site-packages\\scipy\\stats\\_stats_py.py:1103: RuntimeWarning: divide by zero encountered in divide\n",
|
251
|
-
" var *= np.divide(n, n-ddof) # to avoid error on division by zero\n",
|
252
|
-
"C:\\Users\\admin\\anaconda3\\Lib\\site-packages\\scipy\\stats\\_stats_py.py:1103: RuntimeWarning: invalid value encountered in scalar multiply\n",
|
253
|
-
" var *= np.divide(n, n-ddof) # to avoid error on division by zero\n"
|
254
|
-
]
|
255
|
-
},
|
256
|
-
{
|
257
|
-
"name": "stdout",
|
258
|
-
"output_type": "stream",
|
259
|
-
"text": [
|
260
|
-
"| Rank | Bigram | Frequency | Mean Probability (μ-value) | t-Statistic | p-Value (t-Test) | Chi2 Statistic | p-Value (Chi-Square) |\n",
|
261
|
-
"|--------+--------------------------------+-------------+------------------------------+---------------+--------------------+------------------+------------------------|\n",
|
262
|
-
"| 1 | ('impact', 'artificial') | 2 | 0.002699 | nan | nan | 183.747 | 0 |\n",
|
263
|
-
"| 2 | ('artificial', 'intelligence') | 3 | 0.004049 | nan | nan | 307.077 | 0 |\n",
|
264
|
-
"| 3 | ('intelligence', 'data') | 3 | 0.004049 | nan | nan | 12.4097 | 0.0004 |\n",
|
265
|
-
"| 4 | ('data', 'science') | 15 | 0.020243 | nan | nan | 167.483 | 0 |\n",
|
266
|
-
"| 5 | ('science', 'introduction') | 1 | 0.00135 | nan | nan | 11.6208 | 0.0007 |\n",
|
267
|
-
"| 6 | ('introduction', 'recent') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
|
268
|
-
"| 7 | ('recent', 'years') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
|
269
|
-
"| 8 | ('years', 'convergence') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
|
270
|
-
"| 9 | ('convergence', 'artificial') | 1 | 0.00135 | nan | nan | 61.0835 | 0 |\n",
|
271
|
-
"| 10 | ('intelligence', 'ai') | 1 | 0.00135 | nan | nan | 0.4959 | 0.4813 |\n"
|
272
|
-
]
|
273
|
-
}
|
274
|
-
],
|
275
|
-
"source": [
|
276
|
-
"import nltk\n",
|
277
|
-
"from nltk.tokenize import word_tokenize\n",
|
278
|
-
"from nltk.corpus import stopwords\n",
|
279
|
-
"from nltk.probability import FreqDist\n",
|
280
|
-
"from tabulate import tabulate\n",
|
281
|
-
"from scipy.stats import ttest_1samp, chi2_contingency\n",
|
282
|
-
"import numpy as np\n",
|
283
|
-
"\n",
|
284
|
-
"def preprocess_text(text: str) -> list[str]:\n",
|
285
|
-
" \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
|
286
|
-
" words = word_tokenize(text)\n",
|
287
|
-
" stop_words = set(stopwords.words('english'))\n",
|
288
|
-
" return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
|
289
|
-
"\n",
|
290
|
-
"def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
|
291
|
-
" \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
|
292
|
-
" mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
|
293
|
-
" return mean_probabilities\n",
|
294
|
-
"\n",
|
295
|
-
"def perform_statistical_tests(bigram_freq: FreqDist, word_freq: FreqDist, total_bigrams: int):\n",
|
296
|
-
" \"\"\"Perform t-test and chi-square test for each bigram.\"\"\"\n",
|
297
|
-
" results = []\n",
|
298
|
-
"\n",
|
299
|
-
" for bigram, observed_freq in bigram_freq.items():\n",
|
300
|
-
" word1, word2 = bigram\n",
|
301
|
-
" freq_w1 = word_freq.get(word1, 0)\n",
|
302
|
-
" freq_w2 = word_freq.get(word2, 0)\n",
|
303
|
-
" \n",
|
304
|
-
" # Expected frequency for the bigram assuming independence\n",
|
305
|
-
" expected_freq = (freq_w1 * freq_w2) / total_bigrams\n",
|
306
|
-
" \n",
|
307
|
-
" # Chi-square test\n",
|
308
|
-
" observed = np.array([\n",
|
309
|
-
" [observed_freq, freq_w1 - observed_freq],\n",
|
310
|
-
" [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)]\n",
|
311
|
-
" ])\n",
|
312
|
-
" \n",
|
313
|
-
" try:\n",
|
314
|
-
" chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)\n",
|
315
|
-
" except ValueError:\n",
|
316
|
-
" chi2_stat, p_value_chi2 = np.nan, np.nan\n",
|
317
|
-
" \n",
|
318
|
-
" # Perform one-sample t-test (the sample holds a single value, so the t-statistic and p-value come out as nan)\n",
|
319
|
-
" sample_mean = observed_freq\n",
|
320
|
-
" sample_std = np.std([observed_freq] * 10) # Std of 10 identical copies of observed_freq (always 0); not used below\n",
|
321
|
-
" t_stat, p_value_t = ttest_1samp([sample_mean], expected_freq)\n",
|
322
|
-
" \n",
|
323
|
-
" results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))\n",
|
324
|
-
" \n",
|
325
|
-
" return results\n",
|
326
|
-
"\n",
|
327
|
-
"def main():\n",
|
328
|
-
" # Download required NLTK data\n",
|
329
|
-
" # nltk.download('punkt')\n",
|
330
|
-
" # nltk.download('stopwords')\n",
|
331
|
-
"\n",
|
332
|
-
" # Load text\n",
|
333
|
-
" with open(\"text3.txt\", 'r') as file:\n",
|
334
|
-
" text = file.read()\n",
|
335
|
-
"\n",
|
336
|
-
" # Preprocess text\n",
|
337
|
-
" words = preprocess_text(text)\n",
|
338
|
-
"\n",
|
339
|
-
" # Calculate word frequency distribution\n",
|
340
|
-
" word_freq = FreqDist(words)\n",
|
341
|
-
"\n",
|
342
|
-
" # Calculate bigrams and their frequencies\n",
|
343
|
-
" bigrams = list(nltk.bigrams(words))\n",
|
344
|
-
" bigram_freq = FreqDist(bigrams)\n",
|
345
|
-
"\n",
|
346
|
-
" # Calculate mean probability (μ-value) for each bigram\n",
|
347
|
-
" mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
|
348
|
-
"\n",
|
349
|
-
" # Sort collocations by mean probability (the sorted list is not used further below)\n",
|
350
|
-
" collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
|
351
|
-
"\n",
|
352
|
-
" # Calculate mean of bigram frequencies\n",
|
353
|
-
" total_bigram_freq = sum(bigram_freq.values())\n",
|
354
|
-
" mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
|
355
|
-
"\n",
|
356
|
-
" # Print mean bigram frequency\n",
|
357
|
-
" print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")\n",
|
358
|
-
"\n",
|
359
|
-
" # Perform statistical tests for each bigram\n",
|
360
|
-
" results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))\n",
|
361
|
-
"\n",
|
362
|
-
" # Print the first N results (taken in iteration order, not sorted) with their frequencies, mean probabilities and test statistics\n",
|
363
|
-
" N = 10\n",
|
364
|
-
" headers = [\"Rank\", \"Bigram\", \"Frequency\", \"Mean Probability (μ-value)\", \"t-Statistic\", \"p-Value (t-Test)\", \"Chi2 Statistic\", \"p-Value (Chi-Square)\"]\n",
|
365
|
-
" table = []\n",
|
366
|
-
" for i, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(results[:N]):\n",
|
367
|
-
" table.append([\n",
|
368
|
-
" i+1, \n",
|
369
|
-
" bigram, \n",
|
370
|
-
" observed_freq, \n",
|
371
|
-
" f\"{mean_probabilities.get(bigram, 0):.6f}\", \n",
|
372
|
-
" f\"{t_stat:.4f}\", \n",
|
373
|
-
" f\"{p_value_t:.4f}\", \n",
|
374
|
-
" f\"{chi2_stat:.4f}\", \n",
|
375
|
-
" f\"{p_value_chi2:.4f}\"\n",
|
376
|
-
" ])\n",
|
377
|
-
" \n",
|
378
|
-
" print(tabulate(table, headers, tablefmt=\"orgtbl\"))\n",
|
379
|
-
"\n",
|
380
|
-
"if __name__ == \"__main__\":\n",
|
381
|
-
" main()"
|
382
|
-
]
|
383
|
-
},
|
384
|
-
{
|
385
|
-
"cell_type": "code",
|
386
|
-
"execution_count": 6,
|
387
|
-
"id": "9b83ea08-3cf4-47db-8a13-aed1d7b21d6d",
|
388
|
-
"metadata": {},
|
389
|
-
"outputs": [
|
390
|
-
{
|
391
|
-
"name": "stdout",
|
392
|
-
"output_type": "stream",
|
393
|
-
"text": [
|
394
|
-
"Mean Bigram Frequency: 1.17\n",
|
395
|
-
"\n",
|
396
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
397
|
-
"| Rank | Bigram | Frequency | Mean Prob(μ) | t-Statistic | p-Value (t-Test) | Chi2 Statistic | p-Value (Chi-Square) |\n",
|
398
|
-
"+========+================================+=============+================+===============+====================+==================+========================+\n",
|
399
|
-
"| 1 | ('impact', 'artificial') | 2 | 0.002699 | nan | nan | 183.747 | 0 |\n",
|
400
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
401
|
-
"| 2 | ('artificial', 'intelligence') | 3 | 0.004049 | nan | nan | 307.077 | 0 |\n",
|
402
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
403
|
-
"| 3 | ('intelligence', 'data') | 3 | 0.004049 | nan | nan | 12.4097 | 0.0004 |\n",
|
404
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
405
|
-
"| 4 | ('data', 'science') | 15 | 0.020243 | nan | nan | 167.483 | 0 |\n",
|
406
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
407
|
-
"| 5 | ('science', 'introduction') | 1 | 0.00135 | nan | nan | 11.6208 | 0.0007 |\n",
|
408
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
409
|
-
"| 6 | ('introduction', 'recent') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
|
410
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
411
|
-
"| 7 | ('recent', 'years') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
|
412
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
413
|
-
"| 8 | ('years', 'convergence') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
|
414
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
415
|
-
"| 9 | ('convergence', 'artificial') | 1 | 0.00135 | nan | nan | 61.0835 | 0 |\n",
|
416
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
|
417
|
-
"| 10 | ('intelligence', 'ai') | 1 | 0.00135 | nan | nan | 0.4959 | 0.4813 |\n",
|
418
|
-
"+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n"
|
419
|
-
]
|
420
|
-
}
|
421
|
-
],
|
422
|
-
"source": [
|
423
|
-
"import nltk\n",
|
424
|
-
"from nltk.tokenize import word_tokenize\n",
|
425
|
-
"from nltk.corpus import stopwords\n",
|
426
|
-
"from nltk.probability import FreqDist\n",
|
427
|
-
"from tabulate import tabulate\n",
|
428
|
-
"from scipy.stats import ttest_1samp, chi2_contingency\n",
|
429
|
-
"import numpy as np\n",
|
430
|
-
"\n",
|
431
|
-
"def preprocess_text(text: str) -> list[str]:\n",
|
432
|
-
" \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
|
433
|
-
" words = word_tokenize(text)\n",
|
434
|
-
" stop_words = set(stopwords.words('english'))\n",
|
435
|
-
" return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
|
436
|
-
"\n",
|
437
|
-
"def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
|
438
|
-
" \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
|
439
|
-
" mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
|
440
|
-
" return mean_probabilities\n",
|
441
|
-
"\n",
|
442
|
-
"def perform_statistical_tests(bigram_freq: FreqDist, word_freq: FreqDist, total_bigrams: int):\n",
|
443
|
-
" \"\"\"Perform t-test and chi-square test for each bigram.\"\"\"\n",
|
444
|
-
" results = []\n",
|
445
|
-
"\n",
|
446
|
-
" for bigram, observed_freq in bigram_freq.items():\n",
|
447
|
-
" word1, word2 = bigram\n",
|
448
|
-
" freq_w1 = word_freq.get(word1, 0)\n",
|
449
|
-
" freq_w2 = word_freq.get(word2, 0)\n",
|
450
|
-
" \n",
|
451
|
-
" # Expected frequency for the bigram assuming independence\n",
|
452
|
-
" expected_freq = (freq_w1 * freq_w2) / total_bigrams\n",
|
453
|
-
" \n",
|
454
|
-
" # Chi-square test\n",
|
455
|
-
" observed = np.array([\n",
|
456
|
-
" [observed_freq, freq_w1 - observed_freq],\n",
|
457
|
-
" [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)]\n",
|
458
|
-
" ])\n",
|
459
|
-
" \n",
|
460
|
-
" try:\n",
|
461
|
-
" chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)\n",
|
462
|
-
" except ValueError:\n",
|
463
|
-
" chi2_stat, p_value_chi2 = np.nan, np.nan\n",
|
464
|
-
" \n",
|
465
|
-
" # Perform one-sample t-test\n",
|
466
|
-
" sample_mean = observed_freq\n",
|
467
|
-
" sample_std = np.std([observed_freq] * 10) # Std of 10 identical copies of observed_freq is always 0, so the check below always yields NaN\n",
|
468
|
-
" \n",
|
469
|
-
" if sample_std == 0:\n",
|
470
|
-
" t_stat, p_value_t = np.nan, np.nan\n",
|
471
|
-
" else:\n",
|
472
|
-
" t_stat, p_value_t = ttest_1samp([sample_mean], expected_freq)\n",
|
473
|
-
" \n",
|
474
|
-
" results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))\n",
|
475
|
-
" \n",
|
476
|
-
" return results\n",
|
477
|
-
"\n",
|
478
|
-
"def main():\n",
|
479
|
-
" # Download required NLTK data\n",
|
480
|
-
" #nltk.download('punkt')\n",
|
481
|
-
" #nltk.download('stopwords')\n",
|
482
|
-
"\n",
|
483
|
-
" # Load text\n",
|
484
|
-
" with open(\"text3.txt\", 'r') as file:\n",
|
485
|
-
" text = file.read()\n",
|
486
|
-
"\n",
|
487
|
-
" # Preprocess text\n",
|
488
|
-
" words = preprocess_text(text)\n",
|
489
|
-
"\n",
|
490
|
-
" # Calculate word frequency distribution\n",
|
491
|
-
" word_freq = FreqDist(words)\n",
|
492
|
-
"\n",
|
493
|
-
" # Calculate bigrams and their frequencies\n",
|
494
|
-
" bigrams = list(nltk.bigrams(words))\n",
|
495
|
-
" bigram_freq = FreqDist(bigrams)\n",
|
496
|
-
"\n",
|
497
|
-
" # Calculate mean probability (μ-value) for each bigram\n",
|
498
|
-
" mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
|
499
|
-
"\n",
|
500
|
-
" # Sort collocations by mean probability (the sorted list is not used further below)\n",
|
501
|
-
" collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
|
502
|
-
"\n",
|
503
|
-
" # Calculate mean of bigram frequencies\n",
|
504
|
-
" total_bigram_freq = sum(bigram_freq.values())\n",
|
505
|
-
" mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
|
506
|
-
"\n",
|
507
|
-
" # Print mean bigram frequency\n",
|
508
|
-
" print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")\n",
|
509
|
-
"\n",
|
510
|
-
" # Perform statistical tests for each bigram\n",
|
511
|
-
" results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))\n",
|
512
|
-
"\n",
|
513
|
-
" # Print the first N results (taken in iteration order, not sorted) with their frequencies, mean probabilities and test statistics\n",
|
514
|
-
" N = 10\n",
|
515
|
-
" headers = [\"Rank\", \"Bigram\", \"Frequency\", \"Mean Prob(μ)\", \"t-Statistic\", \"p-Value (t-Test)\", \"Chi2 Statistic\", \"p-Value (Chi-Square)\"]\n",
|
516
|
-
" table = []\n",
|
517
|
-
" for i, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(results[:N]):\n",
|
518
|
-
" table.append([\n",
|
519
|
-
" i+1, \n",
|
520
|
-
" bigram, \n",
|
521
|
-
" observed_freq, \n",
|
522
|
-
" f\"{mean_probabilities.get(bigram, 0):.6f}\", \n",
|
523
|
-
" f\"{t_stat:.4f}\" if not np.isnan(t_stat) else \"NaN\", \n",
|
524
|
-
" f\"{p_value_t:.4f}\" if not np.isnan(p_value_t) else \"NaN\", \n",
|
525
|
-
" f\"{chi2_stat:.4f}\" if not np.isnan(chi2_stat) else \"NaN\", \n",
|
526
|
-
" f\"{p_value_chi2:.4f}\" if not np.isnan(p_value_chi2) else \"NaN\"\n",
|
527
|
-
" ])\n",
|
528
|
-
" print(tabulate(table, headers, tablefmt=\"grid\"))\n",
|
529
|
-
"\n",
|
530
|
-
"if __name__ == \"__main__\":\n",
|
531
|
-
" main()"
|
532
|
-
]
|
533
|
-
},
|
534
|
-
{
|
535
|
-
"cell_type": "code",
|
536
|
-
"execution_count": 10,
|
537
|
-
"id": "841d503b-6e3c-4c37-b11f-5482cc462cad",
|
538
|
-
"metadata": {},
|
539
|
-
"outputs": [
|
540
|
-
{
|
541
|
-
"name": "stdout",
|
542
|
-
"output_type": "stream",
|
543
|
-
"text": [
|
544
|
-
"Mean Bigram Frequency: 1.17\n",
|
545
|
-
"\n",
|
546
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
547
|
-
"| Rank | Bigram | Frequency | Mean Prob(μ) | t-Statistic | p-Value(t-Test) | Chi Square | p-Value(Chi-Square) |\n",
|
548
|
-
"+========+================================+=============+================+===============+===================+==============+=======================+\n",
|
549
|
-
"| 1 | ('impact', 'artificial') | 2 | 0.002699 | 1.4152 | 0.1574 | 183.747 | 0 |\n",
|
550
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
551
|
-
"| 2 | ('artificial', 'intelligence') | 3 | 0.004049 | 1.7344 | 0.0833 | 307.077 | 0 |\n",
|
552
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
553
|
-
"| 3 | ('intelligence', 'data') | 3 | 0.004049 | 1.7344 | 0.0833 | 12.4097 | 0.0004 |\n",
|
554
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
555
|
-
"| 4 | ('data', 'science') | 15 | 0.020243 | 3.9101 | 0.0001 | 167.483 | 0 |\n",
|
556
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
557
|
-
"| 5 | ('science', 'introduction') | 1 | 0.00135 | 1 | 0.3176 | 11.6208 | 0.0007 |\n",
|
558
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
559
|
-
"| 6 | ('introduction', 'recent') | 1 | 0.00135 | 1 | 0.3176 | 184.75 | 0 |\n",
|
560
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
561
|
-
"| 7 | ('recent', 'years') | 1 | 0.00135 | 1 | 0.3176 | 184.75 | 0 |\n",
|
562
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
563
|
-
"| 8 | ('years', 'convergence') | 1 | 0.00135 | 1 | 0.3176 | 184.75 | 0 |\n",
|
564
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
565
|
-
"| 9 | ('convergence', 'artificial') | 1 | 0.00135 | 1 | 0.3176 | 61.0835 | 0 |\n",
|
566
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
|
567
|
-
"| 10 | ('intelligence', 'ai') | 1 | 0.00135 | 1 | 0.3176 | 0.4959 | 0.4813 |\n",
|
568
|
-
"+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n"
|
569
|
-
]
|
570
|
-
}
|
571
|
-
],
|
572
|
-
"source": [
|
573
|
-
"import nltk\n",
|
574
|
-
"from nltk.tokenize import word_tokenize\n",
|
575
|
-
"from nltk.corpus import stopwords\n",
|
576
|
-
"from nltk.probability import FreqDist\n",
|
577
|
-
"from tabulate import tabulate\n",
|
578
|
-
"from scipy.stats import chi2_contingency, ttest_1samp\n",
|
579
|
-
"import numpy as np\n",
|
580
|
-
"\n",
|
581
|
-
"def preprocess_text(text: str) -> list[str]:\n",
|
582
|
-
" \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
|
583
|
-
" words = word_tokenize(text)\n",
|
584
|
-
" stop_words = set(stopwords.words('english'))\n",
|
585
|
-
" return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
|
586
|
-
"\n",
|
587
|
-
"def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
|
588
|
-
" \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
|
589
|
-
" mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
|
590
|
-
" return mean_probabilities\n",
|
591
|
-
"\n",
|
592
|
-
"def perform_statistical_tests(bigram_freq: FreqDist, word_freq: FreqDist, total_bigrams: int):\n",
|
593
|
-
" \"\"\"Perform t-test and chi-square test for each bigram.\"\"\"\n",
|
594
|
-
" results = []\n",
|
595
|
-
"\n",
|
596
|
-
" for bigram, observed_freq in bigram_freq.items():\n",
|
597
|
-
" word1, word2 = bigram\n",
|
598
|
-
" freq_w1 = word_freq.get(word1, 0)\n",
|
599
|
-
" freq_w2 = word_freq.get(word2, 0)\n",
|
600
|
-
" \n",
|
601
|
-
" # Expected frequency for the bigram assuming independence\n",
|
602
|
-
" expected_freq = (freq_w1 * freq_w2) / total_bigrams\n",
|
603
|
-
" \n",
|
604
|
-
" # Chi-square test\n",
|
605
|
-
" observed = np.array([\n",
|
606
|
-
" [observed_freq, freq_w1 - observed_freq],\n",
|
607
|
-
" [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)]\n",
|
608
|
-
" ])\n",
|
609
|
-
" \n",
|
610
|
-
" try:\n",
|
611
|
-
" chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)\n",
|
612
|
-
" except ValueError:\n",
|
613
|
-
" chi2_stat, p_value_chi2 = np.nan, np.nan\n",
|
614
|
-
" \n",
|
615
|
-
" # Build a synthetic sample for the t-test: observed_freq copies of observed_freq, padded with expected_freq up to total_bigrams values\n",
|
616
|
-
" sample_data = [observed_freq] * observed_freq + [expected_freq] * (total_bigrams - observed_freq)\n",
|
617
|
-
" \n",
|
618
|
-
" # Perform one-sample t-test\n",
|
619
|
-
" t_stat, p_value_t = ttest_1samp(sample_data, expected_freq)\n",
|
620
|
-
" \n",
|
621
|
-
" results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))\n",
|
622
|
-
" \n",
|
623
|
-
" return results\n",
|
624
|
-
"\n",
|
625
|
-
"def main():\n",
|
626
|
-
" # Download required NLTK data\n",
|
627
|
-
" # nltk.download('punkt')\n",
|
628
|
-
" # nltk.download('stopwords')\n",
|
629
|
-
"\n",
|
630
|
-
" # Load text\n",
|
631
|
-
" with open(\"text3.txt\", 'r') as file:\n",
|
632
|
-
" text = file.read()\n",
|
633
|
-
"\n",
|
634
|
-
" # Preprocess text\n",
|
635
|
-
" words = preprocess_text(text)\n",
|
636
|
-
"\n",
|
637
|
-
" # Calculate word frequency distribution\n",
|
638
|
-
" word_freq = FreqDist(words)\n",
|
639
|
-
"\n",
|
640
|
-
" # Calculate bigrams and their frequencies\n",
|
641
|
-
" bigrams = list(nltk.bigrams(words))\n",
|
642
|
-
" bigram_freq = FreqDist(bigrams)\n",
|
643
|
-
"\n",
|
644
|
-
" # Calculate mean probability (μ-value) for each bigram\n",
|
645
|
-
" mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
|
646
|
-
"\n",
|
647
|
-
" # Sort collocations by mean probability\n",
|
648
|
-
" collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
|
649
|
-
"\n",
|
650
|
-
" # Calculate mean of bigram frequencies\n",
|
651
|
-
" total_bigram_freq = sum(bigram_freq.values())\n",
|
652
|
-
" mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
|
653
|
-
"\n",
|
654
|
-
" # Print mean bigram frequency\n",
|
655
|
-
" print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")\n",
|
656
|
-
"\n",
|
657
|
-
" # Perform statistical tests for each bigram\n",
|
658
|
-
" results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))\n",
|
659
|
-
"\n",
|
660
|
-
" # Print the first N results (taken in iteration order, not sorted) with their frequencies, mean probabilities and test statistics\n",
|
661
|
-
" N = 10\n",
|
662
|
-
" headers = [\"Rank\", \"Bigram\", \"Frequency\", \"Mean Prob(μ)\", \"t-Statistic\", \"p-Value(t-Test)\", \"Chi Square\", \"p-Value(Chi-Square)\"]\n",
|
663
|
-
" table = []\n",
|
664
|
-
" for i, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(results[:N]):\n",
|
665
|
-
" table.append([\n",
|
666
|
-
" i + 1,\n",
|
667
|
-
" bigram,\n",
|
668
|
-
" observed_freq,\n",
|
669
|
-
" f\"{mean_probabilities.get(bigram, 0):.6f}\",\n",
|
670
|
-
" f\"{t_stat:.4f}\" if not np.isnan(t_stat) else \"NaN\",\n",
|
671
|
-
" f\"{p_value_t:.4f}\" if not np.isnan(p_value_t) else \"NaN\",\n",
|
672
|
-
" f\"{chi2_stat:.4f}\" if not np.isnan(chi2_stat) else \"NaN\",\n",
|
673
|
-
" f\"{p_value_chi2:.4f}\" if not np.isnan(p_value_chi2) else \"NaN\"\n",
|
674
|
-
" ])\n",
|
675
|
-
" print(tabulate(table, headers, tablefmt=\"grid\"))\n",
|
676
|
-
"\n",
|
677
|
-
"if __name__ == \"__main__\":\n",
|
678
|
-
" main()\n"
|
679
|
-
]
|
680
|
-
},
|
681
|
-
{
|
682
|
-
"cell_type": "code",
|
683
|
-
"execution_count": null,
|
684
|
-
"id": "fcb18fd4-7444-4f9e-881e-279099049c9f",
|
685
|
-
"metadata": {},
|
686
|
-
"outputs": [],
|
687
|
-
"source": []
|
688
|
-
}
|
689
|
-
],
|
690
|
-
"metadata": {
|
691
|
-
"kernelspec": {
|
692
|
-
"display_name": "Python 3 (ipykernel)",
|
693
|
-
"language": "python",
|
694
|
-
"name": "python3"
|
695
|
-
},
|
696
|
-
"language_info": {
|
697
|
-
"codemirror_mode": {
|
698
|
-
"name": "ipython",
|
699
|
-
"version": 3
|
700
|
-
},
|
701
|
-
"file_extension": ".py",
|
702
|
-
"mimetype": "text/x-python",
|
703
|
-
"name": "python",
|
704
|
-
"nbconvert_exporter": "python",
|
705
|
-
"pygments_lexer": "ipython3",
|
706
|
-
"version": "3.11.7"
|
707
|
-
}
|
708
|
-
},
|
709
|
-
"nbformat": 4,
|
710
|
-
"nbformat_minor": 5
|
711
|
-
}
|