noshot 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. noshot/data/ML TS XAI/ML/1. PCA - EDA/PCA-EDA.ipynb +207 -0
  2. noshot/data/ML TS XAI/ML/1. PCA - EDA/balance-scale.csv +626 -0
  3. noshot/data/ML TS XAI/ML/1. PCA - EDA/input.txt +625 -0
  4. noshot/data/ML TS XAI/ML/2. KNN Classifier/KNN.ipynb +287 -0
  5. noshot/data/ML TS XAI/ML/2. KNN Classifier/balance-scale.csv +626 -0
  6. noshot/data/ML TS XAI/ML/2. KNN Classifier/input.txt +625 -0
  7. noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/LDA.ipynb +83 -0
  8. noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/balance-scale.csv +626 -0
  9. noshot/data/ML TS XAI/ML/3. Linear Discriminant Analysis/input.txt +625 -0
  10. noshot/data/ML TS XAI/ML/4. Linear Regression/Linear-Regression.ipynb +117 -0
  11. noshot/data/ML TS XAI/ML/4. Linear Regression/machine-data.csv +210 -0
  12. noshot/data/ML TS XAI/ML/5. Logistic Regression/Logistic-Regression.ipynb +137 -0
  13. noshot/data/ML TS XAI/ML/5. Logistic Regression/wine-dataset.csv +179 -0
  14. noshot/data/ML TS XAI/ML/6. Bayesian Classifier/Bayesian.ipynb +129 -0
  15. noshot/data/ML TS XAI/ML/6. Bayesian Classifier/wine-dataset.csv +179 -0
  16. {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/METADATA +2 -2
  17. noshot-0.1.8.dist-info/RECORD +24 -0
  18. noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(A) Breadth First Search.ipynb +0 -112
  19. noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(B) Depth First Search.ipynb +0 -111
  20. noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(C) Uniform Cost Search.ipynb +0 -134
  21. noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(D) Depth Limites Search.ipynb +0 -115
  22. noshot/data/ML TS XAI/AIDS/1. Implement Basic Search Strategies/(E) Iterative Deepening DFS.ipynb +0 -123
  23. noshot/data/ML TS XAI/AIDS/10. ANOVA/2_ANOVA.csv +0 -769
  24. noshot/data/ML TS XAI/AIDS/10. ANOVA/One Way ANOVA (Repeated Measure).ipynb +0 -126
  25. noshot/data/ML TS XAI/AIDS/10. ANOVA/One Way ANOVA.ipynb +0 -134
  26. noshot/data/ML TS XAI/AIDS/10. ANOVA/Sample 1 Way ANOVA Test.ipynb +0 -119
  27. noshot/data/ML TS XAI/AIDS/10. ANOVA/Two Way ANOVA.ipynb +0 -138
  28. noshot/data/ML TS XAI/AIDS/10. ANOVA/reaction_time.csv +0 -5
  29. noshot/data/ML TS XAI/AIDS/10. ANOVA/sample_data.csv +0 -16
  30. noshot/data/ML TS XAI/AIDS/10. ANOVA/sleep_deprivation.csv +0 -4
  31. noshot/data/ML TS XAI/AIDS/11. Linear Regression/3_Linear.csv +0 -4802
  32. noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression LAB.ipynb +0 -113
  33. noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression New- sklearn.ipynb +0 -118
  34. noshot/data/ML TS XAI/AIDS/11. Linear Regression/Linear Regression.ipynb +0 -148
  35. noshot/data/ML TS XAI/AIDS/11. Linear Regression/house_rate.csv +0 -22
  36. noshot/data/ML TS XAI/AIDS/12. Logistic Regression/Logistic Regression New- sklearn.ipynb +0 -128
  37. noshot/data/ML TS XAI/AIDS/12. Logistic Regression/Logistic Regression.ipynb +0 -145
  38. noshot/data/ML TS XAI/AIDS/12. Logistic Regression/default.csv +0 -1001
  39. noshot/data/ML TS XAI/AIDS/12. Logistic Regression/hours_scores_records.csv +0 -101
  40. noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(A) Astar.ipynb +0 -256
  41. noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(B) IDAstar.ipynb +0 -157
  42. noshot/data/ML TS XAI/AIDS/2. Implement A Star And MA Star/(C) SMAstar.ipynb +0 -178
  43. noshot/data/ML TS XAI/AIDS/3. Genetic Algorithm/Genetic.ipynb +0 -95
  44. noshot/data/ML TS XAI/AIDS/4. Simulated Annealing/Simulated Annealing.ipynb +0 -74
  45. noshot/data/ML TS XAI/AIDS/4. Simulated Annealing/Sudoku Simulated Annealing.ipynb +0 -103
  46. noshot/data/ML TS XAI/AIDS/5. Alpha Beta Pruning/AlphaBetaPruning.ipynb +0 -182
  47. noshot/data/ML TS XAI/AIDS/6. Consraint Satisfaction Problems (CSP)/(A) CSP House Allocation.ipynb +0 -120
  48. noshot/data/ML TS XAI/AIDS/6. Consraint Satisfaction Problems (CSP)/(B) CSP Map Coloring.ipynb +0 -125
  49. noshot/data/ML TS XAI/AIDS/7. Random Sampling/Random Sampling.ipynb +0 -73
  50. noshot/data/ML TS XAI/AIDS/7. Random Sampling/height_weight_bmi.csv +0 -8389
  51. noshot/data/ML TS XAI/AIDS/8. Z Test/Z Test Hash Function.ipynb +0 -141
  52. noshot/data/ML TS XAI/AIDS/8. Z Test/Z Test.ipynb +0 -151
  53. noshot/data/ML TS XAI/AIDS/8. Z Test/height_weight_bmi.csv +0 -8389
  54. noshot/data/ML TS XAI/AIDS/9. T Test/1_heart.csv +0 -304
  55. noshot/data/ML TS XAI/AIDS/9. T Test/Independent T Test.ipynb +0 -119
  56. noshot/data/ML TS XAI/AIDS/9. T Test/Paired T Test.ipynb +0 -118
  57. noshot/data/ML TS XAI/AIDS/9. T Test/T Test Hash Function.ipynb +0 -142
  58. noshot/data/ML TS XAI/AIDS/9. T Test/T Test.ipynb +0 -158
  59. noshot/data/ML TS XAI/AIDS/9. T Test/height_weight_bmi.csv +0 -8389
  60. noshot/data/ML TS XAI/AIDS/9. T Test/iq_test.csv +0 -0
  61. noshot/data/ML TS XAI/AIDS/Others (AllinOne)/All In One.ipynb +0 -4581
  62. noshot/data/ML TS XAI/CN/1. Chat Application/chat.java +0 -81
  63. noshot/data/ML TS XAI/CN/1. Chat Application/output.png +0 -0
  64. noshot/data/ML TS XAI/CN/1. Chat Application/procedure.png +0 -0
  65. noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/LAN.tcl +0 -65
  66. noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/analysis.awk +0 -44
  67. noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/output.png +0 -0
  68. noshot/data/ML TS XAI/CN/10. Ethernet LAN IEEE 802.3/procedure.png +0 -0
  69. noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/complexdcf.tcl +0 -229
  70. noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/output.png +0 -0
  71. noshot/data/ML TS XAI/CN/11. Wireless LAN IEEE 802.11/procedure.png +0 -0
  72. noshot/data/ML TS XAI/CN/2. File Transfer/file_to_send.txt +0 -2
  73. noshot/data/ML TS XAI/CN/2. File Transfer/filetransfer.java +0 -119
  74. noshot/data/ML TS XAI/CN/2. File Transfer/output.png +0 -0
  75. noshot/data/ML TS XAI/CN/2. File Transfer/procedure.png +0 -0
  76. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/Client.class +0 -0
  77. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/MyServerImpl.class +0 -0
  78. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/MyServerIntf.class +0 -0
  79. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/Server.class +0 -0
  80. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/output.png +0 -0
  81. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/procedure.png +0 -0
  82. noshot/data/ML TS XAI/CN/3. RMI (Remote Method Invocation)/rmi.java +0 -56
  83. noshot/data/ML TS XAI/CN/4. Wired Network/output.png +0 -0
  84. noshot/data/ML TS XAI/CN/4. Wired Network/procedure.png +0 -0
  85. noshot/data/ML TS XAI/CN/4. Wired Network/wired.awk +0 -25
  86. noshot/data/ML TS XAI/CN/4. Wired Network/wired.tcl +0 -81
  87. noshot/data/ML TS XAI/CN/5. Wireless Network/output.png +0 -0
  88. noshot/data/ML TS XAI/CN/5. Wireless Network/procedure.png +0 -0
  89. noshot/data/ML TS XAI/CN/5. Wireless Network/wireless.awk +0 -27
  90. noshot/data/ML TS XAI/CN/5. Wireless Network/wireless.tcl +0 -153
  91. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/analysis.awk +0 -27
  92. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/output.png +0 -0
  93. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/sack.tcl +0 -86
  94. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Sack And Vegas/vegas.tcl +0 -86
  95. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/analysis.awk +0 -28
  96. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/output.png +0 -0
  97. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/reno.tcl +0 -78
  98. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Congestion Control/Tahoe And Reno/tahoe.tcl +0 -79
  99. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/analysis.awk +0 -27
  100. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/flow.tcl +0 -163
  101. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/TCP Flow Control/output.png +0 -0
  102. noshot/data/ML TS XAI/CN/6. TCP Flow And Congestion Control/procedure.png +0 -0
  103. noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/DV.tcl +0 -111
  104. noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/LS.tcl +0 -106
  105. noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/analysis.awk +0 -36
  106. noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/output.png +0 -0
  107. noshot/data/ML TS XAI/CN/7. Link State And Distance Vector Routing/procedure.png +0 -0
  108. noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/analysis.awk +0 -20
  109. noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/broadcast.tcl +0 -76
  110. noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/multicast.tcl +0 -103
  111. noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/output.png +0 -0
  112. noshot/data/ML TS XAI/CN/8. Multicast And Broadcast Routing/procedure.png +0 -0
  113. noshot/data/ML TS XAI/CN/9. DHCP/DHCP.java +0 -125
  114. noshot/data/ML TS XAI/CN/9. DHCP/output.png +0 -0
  115. noshot/data/ML TS XAI/CN/9. DHCP/procedure.png +0 -0
  116. noshot/data/ML TS XAI/NLP/NLP 1/1-Prereqs.py +0 -18
  117. noshot/data/ML TS XAI/NLP/NLP 1/2-Chi2test.py +0 -83
  118. noshot/data/ML TS XAI/NLP/NLP 1/2-T-test.py +0 -79
  119. noshot/data/ML TS XAI/NLP/NLP 1/3-WSD-nb.py +0 -53
  120. noshot/data/ML TS XAI/NLP/NLP 1/4-Hindle-Rooth.py +0 -53
  121. noshot/data/ML TS XAI/NLP/NLP 1/5-HMM-Trellis.py +0 -82
  122. noshot/data/ML TS XAI/NLP/NLP 1/6-HMM-Viterbi.py +0 -16
  123. noshot/data/ML TS XAI/NLP/NLP 1/7-PCFG-parsetree.py +0 -15
  124. noshot/data/ML TS XAI/NLP/NLP 1/Chi2test.ipynb +0 -285
  125. noshot/data/ML TS XAI/NLP/NLP 1/Hindle-Rooth.ipynb +0 -179
  126. noshot/data/ML TS XAI/NLP/NLP 1/Lab 10 - Text generator using LSTM.ipynb +0 -1461
  127. noshot/data/ML TS XAI/NLP/NLP 1/Lab 11 NMT.ipynb +0 -2307
  128. noshot/data/ML TS XAI/NLP/NLP 1/PCFG.ipynb +0 -134
  129. noshot/data/ML TS XAI/NLP/NLP 1/Prereqs.ipynb +0 -131
  130. noshot/data/ML TS XAI/NLP/NLP 1/T test.ipynb +0 -252
  131. noshot/data/ML TS XAI/NLP/NLP 1/TFIDF BOW.ipynb +0 -171
  132. noshot/data/ML TS XAI/NLP/NLP 1/Trellis.ipynb +0 -244
  133. noshot/data/ML TS XAI/NLP/NLP 1/WSD.ipynb +0 -645
  134. noshot/data/ML TS XAI/NLP/NLP 1/Word2Vec.ipynb +0 -93
  135. noshot/data/ML TS XAI/NLP/NLP 2/Lab01(tokenizer)/tokenizer.ipynb +0 -370
  136. noshot/data/ML TS XAI/NLP/NLP 2/Lab01(tokenizer)/training_tokenizer.txt +0 -6
  137. noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/exp0.ipynb +0 -274
  138. noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/lab2.ipynb +0 -905
  139. noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/test.txt +0 -1
  140. noshot/data/ML TS XAI/NLP/NLP 2/Lab02(stemming)/tokenizing.ipynb +0 -272
  141. noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/collocation.ipynb +0 -332
  142. noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/lab3.ipynb +0 -549
  143. noshot/data/ML TS XAI/NLP/NLP 2/Lab03(parse-tree)/nlp.txt +0 -1
  144. noshot/data/ML TS XAI/NLP/NLP 2/Lab04(collocation)/Lab4-NLP-Exp-2.ipynb +0 -817
  145. noshot/data/ML TS XAI/NLP/NLP 2/Lab04(collocation)/collocation.ipynb +0 -332
  146. noshot/data/ML TS XAI/NLP/NLP 2/Lab05(WSD)/NLP-Lab-5-Exp3.ipynb +0 -231
  147. noshot/data/ML TS XAI/NLP/NLP 2/Lab05(WSD)/word-sense-disambiguation.ipynb +0 -507
  148. noshot/data/ML TS XAI/NLP/NLP 2/Lab06(additional-exercise)/lab6.ipynb +0 -134
  149. noshot/data/ML TS XAI/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP Exp 4.ipynb +0 -255
  150. noshot/data/ML TS XAI/NLP/NLP 2/Lab07(HMM,Viterbi)/NLP_Exp_5.ipynb +0 -159
  151. noshot/data/ML TS XAI/NLP/NLP 2/Lab08(PCFG)/PCFG.ipynb +0 -282
  152. noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/Lab 9 - MLP classifier.ipynb +0 -670
  153. noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/MLP-alternative-code.ipynb +0 -613
  154. noshot/data/ML TS XAI/NLP/NLP 2/Lab09-Hindle-rooth&MLP/hindle-rooth-algorithm.ipynb +0 -74
  155. noshot/data/ML TS XAI/NLP/NLP 2/Lab10(LSTM)/Lab_10_Text_generator_using_LSTM.ipynb +0 -480
  156. noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Machine-translation.ipynb +0 -445
  157. noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/Viterbi-PCFG.ipynb +0 -105
  158. noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/corpora_tools.py +0 -87
  159. noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/data_utils.py +0 -11
  160. noshot/data/ML TS XAI/NLP/NLP 2/Lab11(Viterbi-PCFG,Machine-translation)/train_translator.py +0 -83
  161. noshot/data/ML TS XAI/NLP/NLP 2/Lab12(Information-Extraction)/Information_Extraction.ipynb +0 -201
  162. noshot/data/ML TS XAI/NLP/NLP 3/Backtrack-without-Verbitri.ipynb +0 -185
  163. noshot/data/ML TS XAI/NLP/NLP 3/Backward-Procedure.ipynb +0 -597
  164. noshot/data/ML TS XAI/NLP/NLP 3/Bag_of.ipynb +0 -1422
  165. noshot/data/ML TS XAI/NLP/NLP 3/CYK-algorithm.ipynb +0 -1067
  166. noshot/data/ML TS XAI/NLP/NLP 3/Forward-Procedure.ipynb +0 -477
  167. noshot/data/ML TS XAI/NLP/NLP 3/LSTM.ipynb +0 -1290
  168. noshot/data/ML TS XAI/NLP/NLP 3/Lab 10 - Text generator using LSTM.ipynb +0 -1461
  169. noshot/data/ML TS XAI/NLP/NLP 3/Lab 11 NMT.ipynb +0 -2307
  170. noshot/data/ML TS XAI/NLP/NLP 3/NLP-LAB-4.ipynb +0 -216
  171. noshot/data/ML TS XAI/NLP/NLP 3/NLP-LAB-5.ipynb +0 -216
  172. noshot/data/ML TS XAI/NLP/NLP 3/abc.txt +0 -6
  173. noshot/data/ML TS XAI/NLP/NLP 3/ex-1-nltk.ipynb +0 -711
  174. noshot/data/ML TS XAI/NLP/NLP 3/ex-2-nlp.ipynb +0 -267
  175. noshot/data/ML TS XAI/NLP/NLP 3/exp8&9.ipynb +0 -305
  176. noshot/data/ML TS XAI/NLP/NLP 3/hind.ipynb +0 -287
  177. noshot/data/ML TS XAI/NLP/NLP 3/lab66.ipynb +0 -752
  178. noshot/data/ML TS XAI/NLP/NLP 3/leb_3.ipynb +0 -612
  179. noshot/data/ML TS XAI/NLP/NLP 3/naive_bayes_classifier.pkl +0 -0
  180. noshot/data/ML TS XAI/NLP/NLP 3/nlp_leb_1.ipynb +0 -3008
  181. noshot/data/ML TS XAI/NLP/NLP 3/nlp_leb_2.ipynb +0 -3095
  182. noshot/data/ML TS XAI/NLP/NLP 3/nlplab-9.ipynb +0 -295
  183. noshot/data/ML TS XAI/NLP/NLP 3/nltk-ex-4.ipynb +0 -506
  184. noshot/data/ML TS XAI/NLP/NLP 3/text1.txt +0 -48
  185. noshot/data/ML TS XAI/NLP/NLP 3/text2.txt +0 -8
  186. noshot/data/ML TS XAI/NLP/NLP 3/text3.txt +0 -48
  187. noshot/data/ML TS XAI/NLP/NLP 3/translation-rnn.ipynb +0 -812
  188. noshot/data/ML TS XAI/NLP/NLP 3/word2vector.ipynb +0 -173
  189. noshot/data/ML TS XAI/NLP/NLP 4/Backward Procedure Algorithm.ipynb +0 -179
  190. noshot/data/ML TS XAI/NLP/NLP 4/Chi Square Collocation.ipynb +0 -208
  191. noshot/data/ML TS XAI/NLP/NLP 4/Collocation (T test).ipynb +0 -188
  192. noshot/data/ML TS XAI/NLP/NLP 4/Experiment 1.ipynb +0 -437
  193. noshot/data/ML TS XAI/NLP/NLP 4/Forward Procedure Algorithm.ipynb +0 -132
  194. noshot/data/ML TS XAI/NLP/NLP 4/Hindle Rooth.ipynb +0 -414
  195. noshot/data/ML TS XAI/NLP/NLP 4/MachineTranslation.ipynb +0 -368
  196. noshot/data/ML TS XAI/NLP/NLP 4/Multi Layer Perceptron using MLPClassifier.ipynb +0 -86
  197. noshot/data/ML TS XAI/NLP/NLP 4/Multi Layer Perceptron using Tensorflow.ipynb +0 -112
  198. noshot/data/ML TS XAI/NLP/NLP 4/PCFG Inside Probability.ipynb +0 -451
  199. noshot/data/ML TS XAI/NLP/NLP 4/Text Generation using LSTM.ipynb +0 -297
  200. noshot/data/ML TS XAI/NLP/NLP 4/Viterbi.ipynb +0 -310
  201. noshot/data/ML TS XAI/NLP/NLP 4/Word Sense Disambiguation.ipynb +0 -335
  202. noshot/data/ML TS XAI/NLP/NLP 5/10.Text Generation using LSTM.ipynb +0 -316
  203. noshot/data/ML TS XAI/NLP/NLP 5/11.Machine Translation.ipynb +0 -868
  204. noshot/data/ML TS XAI/NLP/NLP 5/2.T and Chi2 Test.ipynb +0 -204
  205. noshot/data/ML TS XAI/NLP/NLP 5/3.Word Sense Diambiguation.ipynb +0 -234
  206. noshot/data/ML TS XAI/NLP/NLP 5/4.Hinddle and Rooth.ipynb +0 -128
  207. noshot/data/ML TS XAI/NLP/NLP 5/5.Forward and Backward.ipynb +0 -149
  208. noshot/data/ML TS XAI/NLP/NLP 5/6.Viterbi.ipynb +0 -111
  209. noshot/data/ML TS XAI/NLP/NLP 5/7.PCFG Parse Tree.ipynb +0 -134
  210. noshot/data/ML TS XAI/NLP/NLP 5/7.PCFG using cyk.ipynb +0 -101
  211. noshot/data/ML TS XAI/NLP/NLP 5/8.Bag of words and TF-IDF.ipynb +0 -310
  212. noshot/data/ML TS XAI/NLP/NLP 5/9.Word2Vector.ipynb +0 -78
  213. noshot/data/ML TS XAI/NLP/NLP 5/NLP ALL In One.ipynb +0 -2619
  214. noshot/data/ML TS XAI/NLP/NLP 5/sample1.txt +0 -15
  215. noshot/data/ML TS XAI/NLP/NLP 5/sample2.txt +0 -4
  216. noshot/data/ML TS XAI/NLP/NLP 5/word2vec_model.bin +0 -0
  217. noshot/data/ML TS XAI/NLP/NLP 6/1. Tokenize, Tagging, NER, Parse Tree.ipynb +0 -312
  218. noshot/data/ML TS XAI/NLP/NLP 6/2. T Test and Chi2 Test.ipynb +0 -185
  219. noshot/data/ML TS XAI/NLP/NLP 6/3. Naive Bayes WSD.ipynb +0 -199
  220. noshot/data/ML TS XAI/NLP/NLP 6/4. Hinddle and Rooth.ipynb +0 -151
  221. noshot/data/ML TS XAI/NLP/NLP 6/5 and 6 FWD, BWD, Viterbi.ipynb +0 -164
  222. noshot/data/ML TS XAI/NLP/NLP 6/7. PCFG using CYK.ipynb +0 -383
  223. noshot/data/ML TS XAI/NLP/NLP 6/8. BOW and TF-IDF.ipynb +0 -252
  224. noshot/data/ML TS XAI/Ubuntu CN Lab.iso +0 -0
  225. noshot-0.1.7.dist-info/RECORD +0 -216
  226. {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/LICENSE.txt +0 -0
  227. {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/WHEEL +0 -0
  228. {noshot-0.1.7.dist-info → noshot-0.1.8.dist-info}/top_level.txt +0 -0
@@ -1,711 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 2,
6
- "id": "62a71ce0-0a21-45ef-9204-16d27ff27b89",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stdout",
11
- "output_type": "stream",
12
- "text": [
13
- "Top 10 Collocations:\n",
14
- "1. Bigram: ('accurate', 'actionable'), Frequency: 1\n",
15
- "2. Bigram: ('accurately', 'model'), Frequency: 1\n",
16
- "3. Bigram: ('action', 'supply'), Frequency: 1\n",
17
- "4. Bigram: ('adjusting', 'parameters'), Frequency: 1\n",
18
- "5. Bigram: ('advancements', 'technology'), Frequency: 1\n",
19
- "6. Bigram: ('allow', 'users'), Frequency: 1\n",
20
- "7. Bigram: ('anomalies', 'triggering'), Frequency: 1\n",
21
- "8. Bigram: ('assess', 'credit'), Frequency: 1\n",
22
- "9. Bigram: ('bandwidth', 'requirements'), Frequency: 1\n",
23
- "10. Bigram: ('becomes', 'available'), Frequency: 1\n"
24
- ]
25
- }
26
- ],
27
- "source": [
28
- "import nltk\n",
29
- "from nltk.tokenize import word_tokenize\n",
30
- "from nltk.corpus import stopwords\n",
31
- "from nltk.probability import FreqDist\n",
32
- "from nltk.collocations import BigramCollocationFinder\n",
33
- "from nltk.metrics import BigramAssocMeasures\n",
34
- "\n",
35
- "# Download required NLTK data\n",
36
- "#nltk.download('punkt')\n",
37
- "#nltk.download('stopwords')\n",
38
- "\n",
39
- "# Load text\n",
40
- "with open('text3.txt', 'r') as file:\n",
41
- " text = file.read()\n",
42
- "\n",
43
- "# Preprocess text\n",
44
- "words = [word.lower() for word in word_tokenize(text) if word.isalnum() and word.lower() not in stopwords.words('english')]\n",
45
- "\n",
46
- "# Calculate bigrams and their frequencies\n",
47
- "bigram_finder = BigramCollocationFinder.from_words(words)\n",
48
- "bigram_freq = bigram_finder.ngram_fd\n",
49
- "\n",
50
- "# Calculate top collocations\n",
51
- "bigram_measures = BigramAssocMeasures()\n",
52
- "collocations = bigram_finder.nbest(bigram_measures.pmi, 10)\n",
53
- "\n",
54
- "# Print top collocations\n",
55
- "print(\"Top 10 Collocations:\")\n",
56
- "for i, collocation in enumerate(collocations):\n",
57
- " print(f\"{i+1}. Bigram: {collocation}, Frequency: {bigram_freq[collocation]}\")"
58
- ]
59
- },
60
- {
61
- "cell_type": "code",
62
- "execution_count": 23,
63
- "id": "d3ef1337-2135-48cd-b1e8-8aa0ab375450",
64
- "metadata": {},
65
- "outputs": [
66
- {
67
- "name": "stdout",
68
- "output_type": "stream",
69
- "text": [
70
- "Mean Bigram Frequency: 1.17\n",
71
- "\n",
72
- "Top 10 Collocations by Mean Probability:\n",
73
- "1. Bigram: ('data', 'science'),\t Frequency: 15, \tMean Probability (μ-value): 0.020243\n",
74
- "2. Bigram: ('data', 'processing'),\t Frequency: 7, \tMean Probability (μ-value): 0.009447\n",
75
- "3. Bigram: ('predictive', 'analytics'),\t Frequency: 5, \tMean Probability (μ-value): 0.006748\n",
76
- "4. Bigram: ('data', 'visualization'),\t Frequency: 5, \tMean Probability (μ-value): 0.006748\n",
77
- "5. Bigram: ('ai', 'data'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
78
- "6. Bigram: ('ai', 'algorithms'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
79
- "7. Bigram: ('data', 'cleaning'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
80
- "8. Bigram: ('cleaning', 'preparation'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
81
- "9. Bigram: ('natural', 'language'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n",
82
- "10. Bigram: ('language', 'processing'),\t Frequency: 4, \tMean Probability (μ-value): 0.005398\n"
83
- ]
84
- }
85
- ],
86
- "source": [
87
- "import nltk\n",
88
- "from nltk.tokenize import word_tokenize\n",
89
- "from nltk.corpus import stopwords\n",
90
- "from nltk.probability import FreqDist\n",
91
- "\n",
92
- "def preprocess_text(text: str) -> list[str]:\n",
93
- " \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
94
- " words = word_tokenize(text)\n",
95
- " stop_words = set(stopwords.words('english'))\n",
96
- " return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
97
- "\n",
98
- "def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
99
- " \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
100
- " mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
101
- " return mean_probabilities\n",
102
- "\n",
103
- "def main():\n",
104
- " # Download required NLTK data\n",
105
- " # nltk.download('punkt')\n",
106
- " # nltk.download('stopwords')\n",
107
- "\n",
108
- " # Load text\n",
109
- " with open('text3.txt', 'r') as file:\n",
110
- " text = file.read()\n",
111
- "\n",
112
- " # Preprocess text\n",
113
- " words = preprocess_text(text)\n",
114
- "\n",
115
- " # Calculate word frequency distribution\n",
116
- " fdist = FreqDist(words)\n",
117
- "\n",
118
- " # Calculate bigrams and their frequencies\n",
119
- " bigrams = list(nltk.bigrams(words))\n",
120
- " bigram_freq = FreqDist(bigrams)\n",
121
- "\n",
122
- " # Calculate mean probability (μ-value) for each bigram\n",
123
- " mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
124
- "\n",
125
- " # Sort collocations by mean probability\n",
126
- " collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
127
- "\n",
128
- " # Calculate mean of bigram frequencies\n",
129
- " total_bigram_freq = sum(bigram_freq.values())\n",
130
- " mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
131
- "\n",
132
- " # Print mean bigram frequency\n",
133
- " print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")\n",
134
- "\n",
135
- " # Print top N collocations with their frequencies and mean probabilities\n",
136
- " N = 10\n",
137
- " print(\"Top\", N, \"Collocations by Mean Probability:\")\n",
138
- " for i, (bigram, mean_prob) in enumerate(collocations[:N]):\n",
139
- " print(f\"{i+1}. Bigram: {bigram},\\t Frequency: {bigram_freq[bigram]}, \\tMean Probability (μ-value): {mean_prob:.6f}\")\n",
140
- "\n",
141
- "if __name__ == \"__main__\":\n",
142
- " main()\n"
143
- ]
144
- },
145
- {
146
- "cell_type": "code",
147
- "execution_count": 1,
148
- "id": "68a08d04-501c-425d-a497-c7fe1d9a0013",
149
- "metadata": {},
150
- "outputs": [
151
- {
152
- "name": "stdout",
153
- "output_type": "stream",
154
- "text": [
155
- "| Rank | Bigram | Frequency | Mean Probability (μ-value) |\n",
156
- "|--------+-----------------------------+-------------+------------------------------|\n",
157
- "| 1 | ('data', 'science') | 15 | 0.020243 |\n",
158
- "| 2 | ('data', 'processing') | 7 | 0.009447 |\n",
159
- "| 3 | ('predictive', 'analytics') | 5 | 0.006748 |\n",
160
- "| 4 | ('data', 'visualization') | 5 | 0.006748 |\n",
161
- "| 5 | ('ai', 'data') | 4 | 0.005398 |\n",
162
- "| 6 | ('ai', 'algorithms') | 4 | 0.005398 |\n",
163
- "| 7 | ('data', 'cleaning') | 4 | 0.005398 |\n",
164
- "| 8 | ('cleaning', 'preparation') | 4 | 0.005398 |\n",
165
- "| 9 | ('natural', 'language') | 4 | 0.005398 |\n",
166
- "| 10 | ('language', 'processing') | 4 | 0.005398 |\n"
167
- ]
168
- }
169
- ],
170
- "source": [
171
- "import nltk\n",
172
- "from nltk.tokenize import word_tokenize\n",
173
- "from nltk.corpus import stopwords\n",
174
- "from nltk.probability import FreqDist\n",
175
- "from tabulate import tabulate\n",
176
- "\n",
177
- "def preprocess_text(text: str) -> list[str]:\n",
178
- " \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
179
- " words = word_tokenize(text)\n",
180
- " stop_words = set(stopwords.words('english'))\n",
181
- " return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
182
- "\n",
183
- "def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
184
- " \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
185
- " mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
186
- " return mean_probabilities\n",
187
- "\n",
188
- "def main():\n",
189
- " # Download required NLTK data\n",
190
- " # nltk.download('punkt')\n",
191
- " # nltk.download('stopwords')\n",
192
- "\n",
193
- " # Load text\n",
194
- " with open('text3.txt', 'r') as file:\n",
195
- " text = file.read()\n",
196
- "\n",
197
- " # Preprocess text\n",
198
- " words = preprocess_text(text)\n",
199
- "\n",
200
- " # Calculate word frequency distribution\n",
201
- " fdist = FreqDist(words)\n",
202
- "\n",
203
- " # Calculate bigrams and their frequencies\n",
204
- " bigrams = list(nltk.bigrams(words))\n",
205
- " bigram_freq = FreqDist(bigrams)\n",
206
- "\n",
207
- " # Calculate mean probability (μ-value) for each bigram\n",
208
- " mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))\n",
209
- "\n",
210
- " # Sort collocations by mean probability\n",
211
- " collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)\n",
212
- "\n",
213
- " '''# Calculate mean of bigram frequencies\n",
214
- " total_bigram_freq = sum(bigram_freq.values())\n",
215
- " mean_bigram_freq = total_bigram_freq / len(bigram_freq)\n",
216
- "\n",
217
- " # Print mean bigram frequency\n",
218
- " print(f\"Mean Bigram Frequency: {mean_bigram_freq:.2f}\\n\")'''\n",
219
- "\n",
220
- " # Print top N collocations with their frequencies and mean probabilities\n",
221
- " N = 10\n",
222
- " headers = [\"Rank\", \"Bigram\", \"Frequency\", \"Mean Probability (μ-value)\"]\n",
223
- " table = []\n",
224
- " for i, (bigram, mean_prob) in enumerate(collocations[:N]):\n",
225
- " table.append([i+1, bigram, bigram_freq[bigram], f\"{mean_prob:.6f}\"])\n",
226
- " print(tabulate(table, headers, tablefmt=\"orgtbl\"))\n",
227
- "\n",
228
- "if __name__ == \"__main__\":\n",
229
- " main()"
230
- ]
231
- },
232
- {
233
- "cell_type": "code",
234
- "execution_count": 3,
235
- "id": "bdcf71a0-02f3-49ba-bf3b-09bb1c079fdf",
236
- "metadata": {},
237
- "outputs": [
238
- {
239
- "name": "stdout",
240
- "output_type": "stream",
241
- "text": [
242
- "Mean Bigram Frequency: 1.17\n",
243
- "\n"
244
- ]
245
- },
246
- {
247
- "name": "stderr",
248
- "output_type": "stream",
249
- "text": [
250
- "C:\\Users\\admin\\anaconda3\\Lib\\site-packages\\scipy\\stats\\_stats_py.py:1103: RuntimeWarning: divide by zero encountered in divide\n",
251
- " var *= np.divide(n, n-ddof) # to avoid error on division by zero\n",
252
- "C:\\Users\\admin\\anaconda3\\Lib\\site-packages\\scipy\\stats\\_stats_py.py:1103: RuntimeWarning: invalid value encountered in scalar multiply\n",
253
- " var *= np.divide(n, n-ddof) # to avoid error on division by zero\n"
254
- ]
255
- },
256
- {
257
- "name": "stdout",
258
- "output_type": "stream",
259
- "text": [
260
- "| Rank | Bigram | Frequency | Mean Probability (μ-value) | t-Statistic | p-Value (t-Test) | Chi2 Statistic | p-Value (Chi-Square) |\n",
261
- "|--------+--------------------------------+-------------+------------------------------+---------------+--------------------+------------------+------------------------|\n",
262
- "| 1 | ('impact', 'artificial') | 2 | 0.002699 | nan | nan | 183.747 | 0 |\n",
263
- "| 2 | ('artificial', 'intelligence') | 3 | 0.004049 | nan | nan | 307.077 | 0 |\n",
264
- "| 3 | ('intelligence', 'data') | 3 | 0.004049 | nan | nan | 12.4097 | 0.0004 |\n",
265
- "| 4 | ('data', 'science') | 15 | 0.020243 | nan | nan | 167.483 | 0 |\n",
266
- "| 5 | ('science', 'introduction') | 1 | 0.00135 | nan | nan | 11.6208 | 0.0007 |\n",
267
- "| 6 | ('introduction', 'recent') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
268
- "| 7 | ('recent', 'years') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
269
- "| 8 | ('years', 'convergence') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
270
- "| 9 | ('convergence', 'artificial') | 1 | 0.00135 | nan | nan | 61.0835 | 0 |\n",
271
- "| 10 | ('intelligence', 'ai') | 1 | 0.00135 | nan | nan | 0.4959 | 0.4813 |\n"
272
- ]
273
- }
274
- ],
275
- "source": [
276
- "import nltk\n",
277
- "from nltk.tokenize import word_tokenize\n",
278
- "from nltk.corpus import stopwords\n",
279
- "from nltk.probability import FreqDist\n",
280
- "from tabulate import tabulate\n",
281
- "from scipy.stats import ttest_1samp, chi2_contingency\n",
282
- "import numpy as np\n",
283
- "\n",
284
- "def preprocess_text(text: str) -> list[str]:\n",
285
- " \"\"\"Preprocesses the text by tokenizing, removing punctuation and stopwords.\"\"\"\n",
286
- " words = word_tokenize(text)\n",
287
- " stop_words = set(stopwords.words('english'))\n",
288
- " return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]\n",
289
- "\n",
290
- "def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:\n",
291
- " \"\"\"Calculates the mean probability (μ-value) of each bigram.\"\"\"\n",
292
- " mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}\n",
293
- " return mean_probabilities\n",
294
- "\n",
295
def perform_statistical_tests(bigram_freq: "FreqDist", word_freq: "FreqDist", total_bigrams: int):
    """Run a one-sample t-test and a chi-square independence test per bigram.

    Args:
        bigram_freq: Mapping of ``(word1, word2)`` to its occurrence count.
        word_freq: Mapping of word to its occurrence count.
        total_bigrams: Total number of bigram positions in the corpus (N).

    Returns:
        A list of ``(bigram, observed_freq, t_stat, p_value_t, chi2_stat,
        p_value_chi2)`` tuples, in the iteration order of ``bigram_freq``.
        A statistic that cannot be computed is reported as ``nan``.
    """
    results = []

    for bigram, observed_freq in bigram_freq.items():
        word1, word2 = bigram
        freq_w1 = word_freq.get(word1, 0)
        freq_w2 = word_freq.get(word2, 0)

        # Chi-square test on the 2x2 contingency table
        # (word1 present / absent) x (word2 present / absent).
        observed = np.array([
            [observed_freq, freq_w1 - observed_freq],
            [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)],
        ])
        try:
            chi2_stat, p_value_chi2, _, _ = chi2_contingency(observed)
        except ValueError:
            # Raised for negative cells or zero expected frequencies.
            chi2_stat, p_value_chi2 = np.nan, np.nan

        # t-test: treat each of the N bigram positions as a 0/1 indicator of
        # "this position holds the bigram" and compare the sample mean with
        # the probability expected if the two words were independent.
        # A one-element sample (the previous approach) has zero degrees of
        # freedom and always yields nan.
        t_stat, p_value_t = np.nan, np.nan
        if total_bigrams >= 2 and 0 < observed_freq < total_bigrams:
            expected_prob = (freq_w1 / total_bigrams) * (freq_w2 / total_bigrams)
            indicators = np.zeros(total_bigrams)
            indicators[:observed_freq] = 1.0
            t_raw, p_raw = ttest_1samp(indicators, expected_prob)
            t_stat, p_value_t = float(t_raw), float(p_raw)

        results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))

    return results
326
- "\n",
327
def main():
    """Read ``text3.txt``, score its bigrams and print the N most frequent.

    Prints the mean bigram frequency followed by a table with each bigram's
    count, relative frequency, t-test and chi-square results. Rows are ranked
    by descending count; ties keep their first-seen order.
    """
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open("text3.txt", 'r') as file:
        text = file.read()

    # Preprocess text and build word / bigram frequency distributions
    words = preprocess_text(text)
    word_freq = FreqDist(words)
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Fewer than two usable words: nothing to average or rank.
    if not bigram_freq:
        print("No bigrams found in the input text.")
        return

    # Mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Mean of bigram frequencies
    mean_bigram_freq = sum(bigram_freq.values()) / len(bigram_freq)
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Statistical tests for each bigram, ranked by observed count so that the
    # table really shows the top N (sorted() is stable, also with reverse=True).
    results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))
    ranked = sorted(results, key=lambda row: row[1], reverse=True)

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Probability (μ-value)", "t-Statistic", "p-Value (t-Test)", "Chi2 Statistic", "p-Value (Chi-Square)"]
    table = []
    for rank, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(ranked[:N], start=1):
        table.append([
            rank,
            bigram,
            observed_freq,
            f"{mean_probabilities.get(bigram, 0):.6f}",
            f"{t_stat:.4f}",
            f"{p_value_t:.4f}",
            f"{chi2_stat:.4f}",
            f"{p_value_chi2:.4f}"
        ])

    print(tabulate(table, headers, tablefmt="orgtbl"))


if __name__ == "__main__":
    main()
382
- ]
383
- },
384
- {
385
- "cell_type": "code",
386
- "execution_count": 6,
387
- "id": "9b83ea08-3cf4-47db-8a13-aed1d7b21d6d",
388
- "metadata": {},
389
- "outputs": [
390
- {
391
- "name": "stdout",
392
- "output_type": "stream",
393
- "text": [
394
- "Mean Bigram Frequency: 1.17\n",
395
- "\n",
396
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
397
- "| Rank | Bigram | Frequency | Mean Prob(μ) | t-Statistic | p-Value (t-Test) | Chi2 Statistic | p-Value (Chi-Square) |\n",
398
- "+========+================================+=============+================+===============+====================+==================+========================+\n",
399
- "| 1 | ('impact', 'artificial') | 2 | 0.002699 | nan | nan | 183.747 | 0 |\n",
400
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
401
- "| 2 | ('artificial', 'intelligence') | 3 | 0.004049 | nan | nan | 307.077 | 0 |\n",
402
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
403
- "| 3 | ('intelligence', 'data') | 3 | 0.004049 | nan | nan | 12.4097 | 0.0004 |\n",
404
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
405
- "| 4 | ('data', 'science') | 15 | 0.020243 | nan | nan | 167.483 | 0 |\n",
406
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
407
- "| 5 | ('science', 'introduction') | 1 | 0.00135 | nan | nan | 11.6208 | 0.0007 |\n",
408
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
409
- "| 6 | ('introduction', 'recent') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
410
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
411
- "| 7 | ('recent', 'years') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
412
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
413
- "| 8 | ('years', 'convergence') | 1 | 0.00135 | nan | nan | 184.75 | 0 |\n",
414
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
415
- "| 9 | ('convergence', 'artificial') | 1 | 0.00135 | nan | nan | 61.0835 | 0 |\n",
416
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n",
417
- "| 10 | ('intelligence', 'ai') | 1 | 0.00135 | nan | nan | 0.4959 | 0.4813 |\n",
418
- "+--------+--------------------------------+-------------+----------------+---------------+--------------------+------------------+------------------------+\n"
419
- ]
420
- }
421
- ],
422
- "source": [
423
- "import nltk\n",
424
- "from nltk.tokenize import word_tokenize\n",
425
- "from nltk.corpus import stopwords\n",
426
- "from nltk.probability import FreqDist\n",
427
- "from tabulate import tabulate\n",
428
- "from scipy.stats import ttest_1samp, chi2_contingency\n",
429
- "import numpy as np\n",
430
- "\n",
431
def preprocess_text(text: str) -> list[str]:
    """Return the lowercased, alphanumeric, non-stopword tokens of ``text``.

    Tokenization uses NLTK's ``word_tokenize``; the stopword list is NLTK's
    English list. Token order is preserved.
    """
    tokens = word_tokenize(text)
    blocked = set(stopwords.words('english'))
    # Filter on the raw token first, then lowercase once and drop stopwords.
    lowered_alnum = (tok.lower() for tok in tokens if tok.isalnum())
    return [tok for tok in lowered_alnum if tok not in blocked]
436
- "\n",
437
def calculate_mean_probability(bigram_freq: "FreqDist", total_bigrams: int) -> dict:
    """Convert bigram counts into relative frequencies.

    Each key of ``bigram_freq`` is mapped to ``count / total_bigrams``.
    """
    pairs = bigram_freq.items()
    return dict((pair, count / total_bigrams) for pair, count in pairs)
441
- "\n",
442
def perform_statistical_tests(bigram_freq: "FreqDist", word_freq: "FreqDist", total_bigrams: int):
    """Run a one-sample t-test and a chi-square independence test per bigram.

    Args:
        bigram_freq: Mapping of ``(word1, word2)`` to its occurrence count.
        word_freq: Mapping of word to its occurrence count.
        total_bigrams: Total number of bigram positions in the corpus (N).

    Returns:
        A list of ``(bigram, observed_freq, t_stat, p_value_t, chi2_stat,
        p_value_chi2)`` tuples, in the iteration order of ``bigram_freq``.
        A statistic that cannot be computed is reported as ``nan``.
    """
    results = []

    for bigram, observed_freq in bigram_freq.items():
        word1, word2 = bigram
        freq_w1 = word_freq.get(word1, 0)
        freq_w2 = word_freq.get(word2, 0)

        # Chi-square test on the 2x2 contingency table
        # (word1 present / absent) x (word2 present / absent).
        observed = np.array([
            [observed_freq, freq_w1 - observed_freq],
            [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)],
        ])
        try:
            chi2_stat, p_value_chi2, _, _ = chi2_contingency(observed)
        except ValueError:
            # Raised for negative cells or zero expected frequencies.
            chi2_stat, p_value_chi2 = np.nan, np.nan

        # t-test over N 0/1 indicators ("this position holds the bigram")
        # against the probability expected under independence. The earlier
        # guard on the std of ten identical values was always zero, so the
        # test never ran and every row reported nan.
        t_stat, p_value_t = np.nan, np.nan
        if total_bigrams >= 2 and 0 < observed_freq < total_bigrams:
            expected_prob = (freq_w1 / total_bigrams) * (freq_w2 / total_bigrams)
            indicators = np.zeros(total_bigrams)
            indicators[:observed_freq] = 1.0
            t_raw, p_raw = ttest_1samp(indicators, expected_prob)
            t_stat, p_value_t = float(t_raw), float(p_raw)

        results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))

    return results
477
- "\n",
478
def main():
    """Read ``text3.txt``, score its bigrams and print the N most frequent.

    Prints the mean bigram frequency followed by a grid table with each
    bigram's count, relative frequency, t-test and chi-square results. Rows
    are ranked by descending count; ties keep their first-seen order.
    """
    # Download required NLTK data
    #nltk.download('punkt')
    #nltk.download('stopwords')

    # Load text
    with open("text3.txt", 'r') as file:
        text = file.read()

    # Preprocess text and build word / bigram frequency distributions
    words = preprocess_text(text)
    word_freq = FreqDist(words)
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Fewer than two usable words: nothing to average or rank.
    if not bigram_freq:
        print("No bigrams found in the input text.")
        return

    # Mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Mean of bigram frequencies
    mean_bigram_freq = sum(bigram_freq.values()) / len(bigram_freq)
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Statistical tests for each bigram, ranked by observed count so that the
    # table really shows the top N (sorted() is stable, also with reverse=True).
    results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))
    ranked = sorted(results, key=lambda row: row[1], reverse=True)

    def fmt(value):
        # Render missing statistics as "NaN" instead of a formatted float.
        return f"{value:.4f}" if not np.isnan(value) else "NaN"

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Prob(μ)", "t-Statistic", "p-Value (t-Test)", "Chi2 Statistic", "p-Value (Chi-Square)"]
    table = []
    for rank, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(ranked[:N], start=1):
        table.append([
            rank,
            bigram,
            observed_freq,
            f"{mean_probabilities.get(bigram, 0):.6f}",
            fmt(t_stat),
            fmt(p_value_t),
            fmt(chi2_stat),
            fmt(p_value_chi2),
        ])
    print(tabulate(table, headers, tablefmt="grid"))


if __name__ == "__main__":
    main()
532
- ]
533
- },
534
- {
535
- "cell_type": "code",
536
- "execution_count": 10,
537
- "id": "841d503b-6e3c-4c37-b11f-5482cc462cad",
538
- "metadata": {},
539
- "outputs": [
540
- {
541
- "name": "stdout",
542
- "output_type": "stream",
543
- "text": [
544
- "Mean Bigram Frequency: 1.17\n",
545
- "\n",
546
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
547
- "| Rank | Bigram | Frequency | Mean Prob(μ) | t-Statistic | p-Value(t-Test) | Chi Square | p-Value(Chi-Square) |\n",
548
- "+========+================================+=============+================+===============+===================+==============+=======================+\n",
549
- "| 1 | ('impact', 'artificial') | 2 | 0.002699 | 1.4152 | 0.1574 | 183.747 | 0 |\n",
550
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
551
- "| 2 | ('artificial', 'intelligence') | 3 | 0.004049 | 1.7344 | 0.0833 | 307.077 | 0 |\n",
552
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
553
- "| 3 | ('intelligence', 'data') | 3 | 0.004049 | 1.7344 | 0.0833 | 12.4097 | 0.0004 |\n",
554
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
555
- "| 4 | ('data', 'science') | 15 | 0.020243 | 3.9101 | 0.0001 | 167.483 | 0 |\n",
556
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
557
- "| 5 | ('science', 'introduction') | 1 | 0.00135 | 1 | 0.3176 | 11.6208 | 0.0007 |\n",
558
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
559
- "| 6 | ('introduction', 'recent') | 1 | 0.00135 | 1 | 0.3176 | 184.75 | 0 |\n",
560
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
561
- "| 7 | ('recent', 'years') | 1 | 0.00135 | 1 | 0.3176 | 184.75 | 0 |\n",
562
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
563
- "| 8 | ('years', 'convergence') | 1 | 0.00135 | 1 | 0.3176 | 184.75 | 0 |\n",
564
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
565
- "| 9 | ('convergence', 'artificial') | 1 | 0.00135 | 1 | 0.3176 | 61.0835 | 0 |\n",
566
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n",
567
- "| 10 | ('intelligence', 'ai') | 1 | 0.00135 | 1 | 0.3176 | 0.4959 | 0.4813 |\n",
568
- "+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+\n"
569
- ]
570
- }
571
- ],
572
- "source": [
573
- "import nltk\n",
574
- "from nltk.tokenize import word_tokenize\n",
575
- "from nltk.corpus import stopwords\n",
576
- "from nltk.probability import FreqDist\n",
577
- "from tabulate import tabulate\n",
578
- "from scipy.stats import chi2_contingency, ttest_1samp\n",
579
- "import numpy as np\n",
580
- "\n",
581
def preprocess_text(text: str) -> list[str]:
    """Split ``text`` into lowercased words, dropping noise tokens.

    Uses NLTK's ``word_tokenize``. Tokens that are not purely alphanumeric are
    discarded, as are tokens whose lowercase form is in NLTK's English
    stopword list. Order is preserved.
    """
    raw_tokens = word_tokenize(text)
    ignore = set(stopwords.words('english'))

    def keep(token):
        # A token survives only if it is alphanumeric and not a stopword.
        return token.isalnum() and token.lower() not in ignore

    return [token.lower() for token in filter(keep, raw_tokens)]
586
- "\n",
587
def calculate_mean_probability(bigram_freq: "FreqDist", total_bigrams: int) -> dict:
    """Return ``{bigram: count / total_bigrams}`` for every entry of ``bigram_freq``."""
    result = {}
    for bigram, freq in bigram_freq.items():
        result.update({bigram: freq / total_bigrams})
    return result
591
- "\n",
592
def perform_statistical_tests(bigram_freq: "FreqDist", word_freq: "FreqDist", total_bigrams: int):
    """Run a one-sample t-test and a chi-square independence test per bigram.

    Args:
        bigram_freq: Mapping of ``(word1, word2)`` to its occurrence count.
        word_freq: Mapping of word to its occurrence count.
        total_bigrams: Total number of bigram positions in the corpus (N).

    Returns:
        A list of ``(bigram, observed_freq, t_stat, p_value_t, chi2_stat,
        p_value_chi2)`` tuples, in the iteration order of ``bigram_freq``.
        A statistic that cannot be computed is reported as ``nan``.
    """
    results = []

    for bigram, observed_freq in bigram_freq.items():
        word1, word2 = bigram
        freq_w1 = word_freq.get(word1, 0)
        freq_w2 = word_freq.get(word2, 0)

        # Chi-square test on the 2x2 contingency table
        # (word1 present / absent) x (word2 present / absent).
        observed = np.array([
            [observed_freq, freq_w1 - observed_freq],
            [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)],
        ])
        try:
            chi2_stat, p_value_chi2, _, _ = chi2_contingency(observed)
        except ValueError:
            # Raised for negative cells or zero expected frequencies.
            chi2_stat, p_value_chi2 = np.nan, np.nan

        # t-test over N 0/1 indicators ("this position holds the bigram")
        # against the probability expected under independence. The earlier
        # sample of `observed` copies of the count padded with the expected
        # count gave |t| = sqrt(obs * (N - 1) / (N - obs)), which does not
        # depend on the word frequencies at all.
        t_stat, p_value_t = np.nan, np.nan
        if total_bigrams >= 2 and 0 < observed_freq < total_bigrams:
            expected_prob = (freq_w1 / total_bigrams) * (freq_w2 / total_bigrams)
            indicators = np.zeros(total_bigrams)
            indicators[:observed_freq] = 1.0
            t_raw, p_raw = ttest_1samp(indicators, expected_prob)
            t_stat, p_value_t = float(t_raw), float(p_raw)

        results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))

    return results
624
- "\n",
625
def main():
    """Read ``text3.txt``, score its bigrams and print the N most frequent.

    Prints the mean bigram frequency followed by a grid table with each
    bigram's count, relative frequency, t-test and chi-square results. Rows
    are ranked by descending count; ties keep their first-seen order.
    """
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open("text3.txt", 'r') as file:
        text = file.read()

    # Preprocess text and build word / bigram frequency distributions
    words = preprocess_text(text)
    word_freq = FreqDist(words)
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Fewer than two usable words: nothing to average or rank.
    if not bigram_freq:
        print("No bigrams found in the input text.")
        return

    # Mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Mean of bigram frequencies
    mean_bigram_freq = sum(bigram_freq.values()) / len(bigram_freq)
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Statistical tests for each bigram, ranked by observed count so that the
    # table really shows the top N (sorted() is stable, also with reverse=True).
    results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))
    ranked = sorted(results, key=lambda row: row[1], reverse=True)

    def fmt(value):
        # Render missing statistics as "NaN" instead of a formatted float.
        return f"{value:.4f}" if not np.isnan(value) else "NaN"

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Prob(μ)", "t-Statistic", "p-Value(t-Test)", "Chi Square", "p-Value(Chi-Square)"]
    table = []
    for rank, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(ranked[:N], start=1):
        table.append([
            rank,
            bigram,
            observed_freq,
            f"{mean_probabilities.get(bigram, 0):.6f}",
            fmt(t_stat),
            fmt(p_value_t),
            fmt(chi2_stat),
            fmt(p_value_chi2),
        ])
    print(tabulate(table, headers, tablefmt="grid"))


if __name__ == "__main__":
    main()
679
- ]
680
- },
681
- {
682
- "cell_type": "code",
683
- "execution_count": null,
684
- "id": "fcb18fd4-7444-4f9e-881e-279099049c9f",
685
- "metadata": {},
686
- "outputs": [],
687
- "source": []
688
- }
689
- ],
690
- "metadata": {
691
- "kernelspec": {
692
- "display_name": "Python 3 (ipykernel)",
693
- "language": "python",
694
- "name": "python3"
695
- },
696
- "language_info": {
697
- "codemirror_mode": {
698
- "name": "ipython",
699
- "version": 3
700
- },
701
- "file_extension": ".py",
702
- "mimetype": "text/x-python",
703
- "name": "python",
704
- "nbconvert_exporter": "python",
705
- "pygments_lexer": "ipython3",
706
- "version": "3.11.7"
707
- }
708
- },
709
- "nbformat": 4,
710
- "nbformat_minor": 5
711
- }