wikipedia-vandalism_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.txt +4 -0
  5. data/README.md +265 -0
  6. data/Rakefile +12 -0
  7. data/lib/java/LibSVM.jar +0 -0
  8. data/lib/java/SMOTE.jar +0 -0
  9. data/lib/java/balancedRandomForest.jar +0 -0
  10. data/lib/java/diffutils-1.3.0.jar +0 -0
  11. data/lib/java/libsvm.jar +0 -0
  12. data/lib/java/oneClassClassifier.jar +0 -0
  13. data/lib/java/realAdaBoost.jar +0 -0
  14. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  15. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  16. data/lib/weka/classifiers/functions/lib_svm.rb +15 -0
  17. data/lib/weka/classifiers/meta/one_class_classifier.rb +25 -0
  18. data/lib/weka/classifiers/meta/real_ada_boost.rb +17 -0
  19. data/lib/weka/classifiers/trees/balanced_random_forest.rb +18 -0
  20. data/lib/weka/filters/supervised/instance/smote.rb +22 -0
  21. data/lib/wikipedia.rb +51 -0
  22. data/lib/wikipedia/vandalism_detection.rb +30 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +18 -0
  24. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +69 -0
  25. data/lib/wikipedia/vandalism_detection/classifier.rb +186 -0
  26. data/lib/wikipedia/vandalism_detection/configuration.rb +321 -0
  27. data/lib/wikipedia/vandalism_detection/diff.rb +27 -0
  28. data/lib/wikipedia/vandalism_detection/edit.rb +75 -0
  29. data/lib/wikipedia/vandalism_detection/evaluator.rb +606 -0
  30. data/lib/wikipedia/vandalism_detection/exceptions.rb +40 -0
  31. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +89 -0
  32. data/lib/wikipedia/vandalism_detection/features.rb +67 -0
  33. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +23 -0
  34. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +22 -0
  35. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +19 -0
  36. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +29 -0
  37. data/lib/wikipedia/vandalism_detection/features/article_size.rb +18 -0
  38. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +23 -0
  39. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/base.rb +54 -0
  41. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +23 -0
  42. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +22 -0
  43. data/lib/wikipedia/vandalism_detection/features/blanking.rb +25 -0
  44. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +25 -0
  45. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +22 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +22 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +17 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +27 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +22 -0
  51. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +22 -0
  52. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +22 -0
  53. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +27 -0
  54. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +18 -0
  55. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +18 -0
  56. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +24 -0
  57. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +65 -0
  58. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  59. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  60. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +20 -0
  61. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +22 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +22 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +18 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  65. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +20 -0
  66. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +18 -0
  67. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +20 -0
  68. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +27 -0
  69. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +27 -0
  70. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +24 -0
  71. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +18 -0
  72. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +23 -0
  73. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +23 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +23 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +22 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +27 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +28 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +23 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +23 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +20 -0
  83. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +23 -0
  84. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +18 -0
  85. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +22 -0
  86. data/lib/wikipedia/vandalism_detection/features/reverted.rb +18 -0
  87. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +29 -0
  89. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +23 -0
  90. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +21 -0
  91. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +22 -0
  92. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +26 -0
  93. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +25 -0
  94. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +31 -0
  95. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +22 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +24 -0
  97. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +31 -0
  98. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +24 -0
  99. data/lib/wikipedia/vandalism_detection/features/user_reputation.rb +38 -0
  100. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +23 -0
  101. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +22 -0
  102. data/lib/wikipedia/vandalism_detection/features/weekday.rb +21 -0
  103. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +22 -0
  104. data/lib/wikipedia/vandalism_detection/instances.rb +130 -0
  105. data/lib/wikipedia/vandalism_detection/page.rb +88 -0
  106. data/lib/wikipedia/vandalism_detection/page_parser.rb +52 -0
  107. data/lib/wikipedia/vandalism_detection/revision.rb +69 -0
  108. data/lib/wikipedia/vandalism_detection/revision_parser.rb +43 -0
  109. data/lib/wikipedia/vandalism_detection/test_dataset.rb +367 -0
  110. data/lib/wikipedia/vandalism_detection/text.rb +18 -0
  111. data/lib/wikipedia/vandalism_detection/training_dataset.rb +303 -0
  112. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  113. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists.rb +19 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +12 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +21 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +22 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +12 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +15 -0
  120. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +12 -0
  121. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +97 -0
  122. data/spec/factories/edit.rb +20 -0
  123. data/spec/factories/page.rb +13 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/config.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +22 -0
  152. data/spec/support/macros/file_reading.rb +7 -0
  153. data/spec/support/macros/test_configuration.rb +71 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +36 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +317 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +517 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +137 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +671 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +128 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +36 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +58 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +61 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +23 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +35 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +36 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +59 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +49 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +36 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +58 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +38 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +35 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +37 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +34 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +34 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +27 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +34 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +34 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +34 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +34 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +42 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +33 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +33 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +35 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +49 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +36 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +51 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +26 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +41 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +46 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +35 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +35 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +35 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +35 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +35 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +36 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +59 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +35 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +26 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +36 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +59 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +36 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +36 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +36 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +46 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +36 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +36 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +36 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +36 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +35 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +36 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +35 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +44 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +28 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +46 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +60 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +36 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +59 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +35 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +57 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +38 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +50 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +22 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +35 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +37 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +35 -0
  227. data/spec/vandalism_detection/features/user_reputation_spec.rb +52 -0
  228. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +36 -0
  229. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +58 -0
  230. data/spec/vandalism_detection/features/weekday_spec.rb +22 -0
  231. data/spec/vandalism_detection/features/words_increment_spec.rb +35 -0
  232. data/spec/vandalism_detection/instances_spec.rb +156 -0
  233. data/spec/vandalism_detection/page_parser_spec.rb +184 -0
  234. data/spec/vandalism_detection/page_spec.rb +135 -0
  235. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  236. data/spec/vandalism_detection/revision_spec.rb +115 -0
  237. data/spec/vandalism_detection/test_dataset_spec.rb +231 -0
  238. data/spec/vandalism_detection/text_spec.rb +29 -0
  239. data/spec/vandalism_detection/training_dataset_spec.rb +264 -0
  240. data/spec/vandalism_detection/wikitext_extractor_spec.rb +72 -0
  241. data/spec/weka/classifiers/functions/lib_svm_spec.rb +38 -0
  242. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +76 -0
  243. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +40 -0
  244. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +40 -0
  245. data/spec/weka/filters/supervised/instance/smote_spec.rb +6 -0
  246. data/wikipedia-vandalism_detection.gemspec +30 -0
  247. metadata +512 -0
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Diff do
4
+
5
+ it "can deal with invalid byte sequences" do
6
+ text = "text \255".force_encoding('UTF-8')
7
+ expect { Wikipedia::VandalismDetection::Diff.new("#{text} a", "#{text} b") }.not_to raise_error
8
+ end
9
+
10
+ before do
11
+ @old_text = Wikipedia::VandalismDetection::Text.new "hello\nworld\nmy name is Luke\n"
12
+ @new_text = Wikipedia::VandalismDetection::Text.new "world\nhello\nmy name is Mr. Skywalker\n"
13
+ @diff = Wikipedia::VandalismDetection::Diff.new(@old_text, @new_text)
14
+ end
15
+
16
+ describe "getting the inserted and removed words" do
17
+
18
+ it "can return the added words as array" do
19
+ inserted_words = @diff.inserted_words
20
+
21
+ expect(inserted_words).to be_an Array
22
+ expect(inserted_words.count).to eq 3
23
+ end
24
+
25
+ it "can return the removed words as array" do
26
+ removed_words = @diff.removed_words
27
+
28
+ expect(removed_words).to be_an Array
29
+ expect(removed_words.count).to eq 2
30
+ end
31
+
32
+ it "returns the right inserted words" do
33
+ expect(@diff.inserted_words).to eq ['hello', 'Mr.', 'Skywalker']
34
+ end
35
+
36
+ it "returns the right removed words" do
37
+ expect(@diff.removed_words).to eq ['hello', 'Luke']
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,137 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Edit do
4
+
5
+ before do
6
+ @old_revision = build :old_revision
7
+ @new_revision = build :new_revision
8
+ @page_id = '1234'
9
+
10
+ @edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision)
11
+ end
12
+
13
+ it "has an old revision" do
14
+ expect(@edit.old_revision).to eq @old_revision
15
+ end
16
+
17
+ it "has a new revision" do
18
+ expect(@edit.new_revision).to eq @new_revision
19
+ end
20
+
21
+ it "can be build with its parent page referenced" do
22
+ page = build(:page, id: '1234', title: 'Page Title')
23
+ edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision, page: page)
24
+ expect(edit.page).to eq page
25
+ end
26
+
27
+ it "can be build with a page to get the id" do
28
+ page_id = '1234'
29
+ page = Wikipedia::VandalismDetection::Page.new
30
+ page.id = page_id
31
+
32
+ edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision, page: page)
33
+ expect(edit.page.id).to eq page_id
34
+ end
35
+
36
+ it "can be build with a page to get the title" do
37
+ page = Wikipedia::VandalismDetection::Page.new
38
+ page_title = 'Article'
39
+ page.title = page_title
40
+
41
+ edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision, page: page)
42
+ expect(edit.page.title).to eq page_title
43
+ end
44
+
45
+ describe "exception handling" do
46
+ it "does not raise an error if page parameters are called" do
47
+ edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision)
48
+ expect { edit.page.id }.not_to raise_error
49
+ end
50
+
51
+ it "raises no error if revisions are not sequent" do
52
+ expect { Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision) }.not_to raise_error
53
+ end
54
+
55
+ it "raises an error if revisions are not sequent" do
56
+ expect { Wikipedia::VandalismDetection::Edit.new(@new_revision, @old_revision) }.to raise_exception ArgumentError
57
+ end
58
+ end
59
+
60
+ describe "#serialize" do
61
+ it "serializes the given parameters into a string" do
62
+ expect(@edit.serialize(:id, :text)).to eq "1,text 1\t2,text 2"
63
+ end
64
+ end
65
+
66
+ describe "#inserted_words" do
67
+ it "returns the inserted words as array" do
68
+ old_revision = build(:old_revision, text: "")
69
+ new_revision = build(:new_revision, text: "inserted words")
70
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
71
+
72
+ expect(edit.inserted_words).to eq ['inserted', 'words']
73
+ end
74
+
75
+ it "returns the uncleaned text inserted words as array" do
76
+ old_revision = build(:old_revision, text: "")
77
+ new_revision = build(:new_revision, text: "[[inserted words]]")
78
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
79
+
80
+ expect(edit.inserted_words).to eq ['[[inserted', 'words]]']
81
+ end
82
+ end
83
+
84
+ describe "#inserted_text" do
85
+ it "returns the inserted text as Wikipedia::VandalismDetection::Text" do
86
+ old_revision = build(:old_revision, text: "")
87
+ new_revision = build(:new_revision, text: "inserted words")
88
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
89
+
90
+ expect(edit.inserted_text).to eq Wikipedia::VandalismDetection::Text.new('inserted words')
91
+ end
92
+
93
+ it "returns the uncleaned text inserted text as Wikipedia::VadalismDetection::Text" do
94
+ old_revision = build(:old_revision, text: "")
95
+ new_revision = build(:new_revision, text: "[[inserted words]]")
96
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
97
+
98
+ expect(edit.inserted_text).to eq Wikipedia::VandalismDetection::Text.new('[[inserted words]]')
99
+ end
100
+ end
101
+
102
+ describe "#removed_words" do
103
+ it "returns the removed words as array" do
104
+ old_revision = build(:old_revision, text: "removed words")
105
+ new_revision = build(:new_revision, text: "")
106
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
107
+
108
+ expect(edit.removed_words).to eq ['removed', 'words']
109
+ end
110
+
111
+ it "returns the uncleaned text rremoved words as array" do
112
+ old_revision = build(:old_revision, text: "[[removed words]]")
113
+ new_revision = build(:new_revision, text: "")
114
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
115
+
116
+ expect(edit.removed_words).to eq ['[[removed', 'words]]']
117
+ end
118
+ end
119
+
120
+ describe "#removed_text" do
121
+ it "returns the removed text as Wikipedia::VandalismDetection::Text" do
122
+ old_revision = build(:old_revision, text: "removed words")
123
+ new_revision = build(:new_revision, text: "")
124
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
125
+
126
+ expect(edit.removed_text).to eq Wikipedia::VandalismDetection::Text.new('removed words')
127
+ end
128
+
129
+ it "returns the uncleaned text removed text as Wikipedia::VadalismDetection::Text" do
130
+ old_revision = build(:old_revision, text: "[[removed words]]")
131
+ new_revision = build(:new_revision, text: "")
132
+ edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
133
+
134
+ expect(edit.removed_text).to eq Wikipedia::VandalismDetection::Text.new('[[removed words]]')
135
+ end
136
+ end
137
+ end
@@ -0,0 +1,671 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Evaluator do
4
+
5
+ before do
6
+ use_test_configuration
7
+ @config = test_config
8
+
9
+ @training_arff_file = @config.training_output_arff_file
10
+ @test_arff_file = @config.test_output_arff_file
11
+ @build_dir = @config.output_base_directory
12
+ @test_classification_file = @config.test_output_classification_file
13
+ end
14
+
15
+ after do
16
+ # remove training arff file
17
+ if File.exists?(@training_arff_file)
18
+ File.delete(@training_arff_file)
19
+ FileUtils.rm_r(File.dirname @training_arff_file)
20
+ end
21
+
22
+ # remove test arff file
23
+ if File.exists?(@test_arff_file)
24
+ File.delete(@test_arff_file)
25
+ FileUtils.rm_r(File.dirname @test_arff_file)
26
+ end
27
+
28
+ # remove classification.txt
29
+ if File.exist?(@test_classification_file)
30
+ File.delete(@test_classification_file)
31
+ File.rm_r(File.dirname @test_classification_file)
32
+ end
33
+
34
+ # remove output base directory
35
+ if Dir.exists?(@build_dir)
36
+ FileUtils.rm_r(@build_dir)
37
+ end
38
+ end
39
+
40
+ describe "#initialize" do
41
+
42
+ it "raises an ArgumentError if classifier attr is not a Wikipedia::VandalismDetection::Classfier" do
43
+ expect { Wikipedia::VandalismDetection::Evaluator.new("") }.to raise_error ArgumentError
44
+ end
45
+
46
+ it "does not raise an error while appropriate initialization" do
47
+ classifier = Wikipedia::VandalismDetection::Classifier.new
48
+ expect { Wikipedia::VandalismDetection::Evaluator.new(classifier) }.not_to raise_error
49
+ end
50
+ end
51
+
52
+ before do
53
+ classifier = Wikipedia::VandalismDetection::Classifier.new
54
+ @evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
55
+ end
56
+
57
+ describe "#test_performance_curves" do
58
+
59
+ before do
60
+ @classification = {
61
+ :"1-2" => {
62
+ old_revision_id: 1,
63
+ new_revision_id: 2,
64
+ class: "R",
65
+ confidence: 0.0
66
+ },
67
+ :"2-3" => {
68
+ old_revision_id: 2,
69
+ new_revision_id: 3,
70
+ class: "R",
71
+ confidence: 0.3
72
+ },
73
+ :"3-4" => {
74
+ old_revision_id: 3,
75
+ new_revision_id: 4,
76
+ class: "V",
77
+ confidence: 0.8
78
+ },
79
+ :"4-5" => {
80
+ old_revision_id: 4,
81
+ new_revision_id: 5,
82
+ class: "V",
83
+ confidence: 1.0
84
+ }
85
+ }
86
+
87
+ # ground truth has one sample more to represent fall-out samples while feature calculation
88
+ # (e.g. redirects are not considered)
89
+ @ground_truth = {
90
+ :"0-1" => { # this is a sample that is not used!
91
+ old_revision_id: 0,
92
+ new_revision_id: 1,
93
+ class: "R"
94
+ },
95
+ :"1-2" => {
96
+ old_revision_id: 1,
97
+ new_revision_id: 2,
98
+ class: "R"
99
+ },
100
+ :"2-3" => {
101
+ old_revision_id: 2,
102
+ new_revision_id: 3,
103
+ class: "V"
104
+ },
105
+ :"3-4" => {
106
+ old_revision_id: 3,
107
+ new_revision_id: 4,
108
+ class: "R"
109
+ },
110
+ :"4-5" => {
111
+ old_revision_id: 4,
112
+ new_revision_id: 5,
113
+ class: "V"
114
+ }
115
+ }
116
+
117
+ @sample_count = 10
118
+
119
+ @curve_data = @evaluator.test_performance_curves(@ground_truth, @classification, @sample_count)
120
+ end
121
+
122
+ it "returns a Hash" do
123
+ expect(@curve_data).to be_a Hash
124
+ end
125
+
126
+ [:recalls, :precisions,:fp_rates, :tp_rates, :pr_auc, :roc_auc].each do |attribute|
127
+ it "returns a Hash including #{attribute}" do
128
+ expect(@curve_data).to have_key(attribute)
129
+ end
130
+ end
131
+
132
+ describe "#predictive_values" do
133
+
134
+ before do
135
+ @threshold = 0.5
136
+ @predictive_values = @evaluator.predictive_values(@ground_truth, @classification, @threshold)
137
+ end
138
+
139
+ it "returns a Hash" do
140
+ expect(@predictive_values).to be_a Hash
141
+ end
142
+
143
+ [
144
+ { threshold: 0.0, result: {tp: 2, fp: 2, tn: 0, fn: 0} },
145
+ { threshold: 0.3, result: {tp: 1, fp: 1, tn: 1, fn: 1} },
146
+ { threshold: 0.5, result: {tp: 1, fp: 1, tn: 1, fn: 1} },
147
+ { threshold: 0.8, result: {tp: 1, fp: 1, tn: 1, fn: 1} },
148
+ { threshold: 0.9, result: {tp: 1, fp: 0, tn: 2, fn: 1} },
149
+ { threshold: 1.0, result: {tp: 0, fp: 0, tn: 2, fn: 2} }
150
+ ].each do |values|
151
+ it "returns the right values for threshold #{values[:threshold]}" do
152
+ predictive_values = @evaluator.predictive_values(@ground_truth, @classification, values[:threshold])
153
+ expect(predictive_values).to eq values[:result]
154
+ end
155
+ end
156
+ end
157
+
158
+ describe "#sort_curve_values" do
159
+
160
+ before do
161
+ @x = [0.7, 0.4, 0.8, 0.4, 0.7]
162
+ @y = [0.6, 0.8, 0.2, 0.6, 0.6]
163
+
164
+ @x_sorted = [0.4, 0.4, 0.7, 0.8]
165
+ @y_sorted = [0.8, 0.6, 0.6, 0.2]
166
+ end
167
+
168
+ it "returns the unique sorted input values" do
169
+ hash = { x: @x_sorted, y: @y_sorted }
170
+ sorted = @evaluator.sort_curve_values(@x, @y)
171
+
172
+ expect(sorted).to eq hash
173
+ end
174
+
175
+ it "adds start values if given" do
176
+ start_values = { x: -1.0, y: -2.0 }
177
+ hash = { x: @x_sorted.unshift(start_values[:x]), y: @y_sorted.unshift(start_values[:y])}
178
+ sorted = @evaluator.sort_curve_values(@x, @y, start_values)
179
+
180
+ expect(sorted).to eq hash
181
+ end
182
+
183
+ it "adds x start value if only one value given" do
184
+ start_values = { x: -1.0 }
185
+ hash = { x: @x_sorted.unshift(start_values[:x]), y: @y_sorted.unshift(@y_sorted.first) }
186
+ sorted = @evaluator.sort_curve_values(@x, @y, start_values)
187
+
188
+ expect(sorted).to eq hash
189
+ end
190
+
191
+ it "adds y start value if only one value given" do
192
+ start_values = { y: -2.0 }
193
+ hash = { x: @x_sorted.unshift(@x_sorted.first), y: @y_sorted.unshift(start_values[:y]) }
194
+ sorted = @evaluator.sort_curve_values(@x, @y, start_values)
195
+
196
+ expect(sorted).to eq hash
197
+ end
198
+
199
+ it "adds end values if given" do
200
+ end_values = { x: -1.0, y: -2.0 }
201
+ hash = { x: @x_sorted.push(end_values[:x]), y: @y_sorted.push(end_values[:y]) }
202
+ sorted = @evaluator.sort_curve_values(@x, @y, nil, end_values)
203
+
204
+ expect(sorted).to eq hash
205
+ end
206
+
207
+ it "adds y end values if only one value is given" do
208
+ end_values = {y: -2.0 }
209
+ hash = { x: @x_sorted.push(@x_sorted.last), y: @y_sorted.push(end_values[:y]) }
210
+ sorted = @evaluator.sort_curve_values(@x, @y, nil, end_values)
211
+
212
+ expect(sorted).to eq hash
213
+ end
214
+
215
+ it "adds x end values if only one value is given" do
216
+ end_values = {x: -1.0 }
217
+ hash = { x: @x_sorted.push(end_values[:x]), y: @y_sorted.push(@y_sorted.last) }
218
+ sorted = @evaluator.sort_curve_values(@x, @y, nil, end_values)
219
+
220
+ expect(sorted).to eq hash
221
+ end
222
+ end
223
+
224
# Specs for Evaluator#area_under_curve(x_values, y_values).
describe "#area_under_curve" do

  before do
    # PR curve: recall on the x axis, precision on the y axis, mirroring the
    # (fp_rates, tp_rates) argument order of the ROC call below.
    # NOTE(review): assumes @curve_data exposes :recalls next to :precisions —
    # confirm against the fixture that builds @curve_data.
    @pr_auc = @evaluator.area_under_curve(@curve_data[:recalls], @curve_data[:precisions])
    @roc_auc = @evaluator.area_under_curve(@curve_data[:fp_rates], @curve_data[:tp_rates])
  end

  it "returns a numeric value for pr_auc" do
    expect(@pr_auc).to be_a Numeric
  end

  it "returns a numeric value between 0.0 & 1.0 for pr_auc" do
    expect(@pr_auc.between?(0.0, 1.0)).to be true
  end

  it "returns a numeric value for roc_auc" do
    expect(@roc_auc).to be_a Numeric
  end

  it "returns a numeric value between 0.0 & 1.0 for roc_auc" do
    expect(@roc_auc.between?(0.0, 1.0)).to be true
  end

  # Known curves with their expected area.
  [
    { x: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], y: [1.0, 0.8, 0.6, 0.4, 0.2, 0.0], auc: 0.5 }
  ].each do |data|
    it "returns the right values" do
      area = @evaluator.area_under_curve(data[:x], data[:y])

      # Compare with a tolerance: summing float segments need not hit the
      # expected value bit-for-bit.
      expect(area).to be_within(1e-9).of(data[:auc])
    end
  end

end
262
+ end
263
+
264
# Specs for Evaluator#create_testcorpus_classification_file!(file_path, ground_truth).
describe "#create_testcorpus_classification_file!" do

  before do
    @ground_truth = { # see resources file ground_truth.csv
      :"0-1" => { # this is a sample that is not used!
        old_revision_id: 0,
        new_revision_id: 1,
        class: "R"
      },
      :"307084144-326873205" => {
        old_revision_id: 307084144,
        new_revision_id: 326873205,
        class: "R"
      },
      :"326471754-326978767" => {
        old_revision_id: 326471754,
        new_revision_id: 326978767,
        class: "V"
      },
      :"328774035-328774110" => {
        old_revision_id: 328774035,
        new_revision_id: 328774110,
        class: "R"
      }
    }
  end

  it "raises an argument error if ground_truth param is nil" do
    expect { @evaluator.create_testcorpus_classification_file!(@test_classification_file, nil) }.to raise_error ArgumentError
  end

  it "creates a classification file in the base output directory" do
    # File.exist? is the supported spelling; File.exists? is deprecated.
    expect(File.exist?(@test_classification_file)).to be false
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)
    expect(File.exist?(@test_classification_file)).to be true
  end

  it "creates a file with an appropriate header" do
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)

    # The last two ARFF attributes are not expected as feature columns.
    attributes = Core::Parser.parse_ARFF(@test_arff_file).enumerate_attributes.to_a
    features = attributes.map { |attr| attr.name.upcase }[0...-2]
    proposed_header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *features]

    # File.readlines opens, reads and closes the file, so no handle is leaked.
    header = File.readlines(@test_classification_file).first.split(' ')

    expect(header).to eq proposed_header
  end

  it "creates a file with an appropriate number of lines" do
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)

    samples_count = Core::Parser.parse_ARFF(@test_arff_file).n_rows

    # Drop the header line; every remaining line is one sample.
    lines = File.readlines(@test_classification_file).drop(1)
    expect(lines.count).to eq samples_count
  end

  it "has the short class names as class value" do
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)

    # Drop the header line; every remaining line is one sample.
    lines = File.readlines(@test_classification_file).drop(1)

    short_classes = Wikipedia::VandalismDetection::Instances::CLASSES_SHORT
    vandalism_index = Wikipedia::VandalismDetection::Instances::VANDALISM_CLASS_INDEX
    regular_index = Wikipedia::VandalismDetection::Instances::REGULAR_CLASS_INDEX
    missing_index = Wikipedia::VandalismDetection::Instances::NOT_KNOWN_INDEX

    names = [short_classes[regular_index], short_classes[vandalism_index], short_classes[missing_index]]

    lines.each do |line|
      # The class value is the third whitespace-separated column ('C' in the header).
      class_name = line.split[2]
      expect(names).to include class_name
    end
  end
end
342
+
343
# Specs for Evaluator#evaluate_testcorpus_classification.
describe "#evaluate_testcorpus_classification" do

  # Activates +config+ and returns an Evaluator wrapping a newly created Classifier.
  def evaluator_for_configuration(config)
    use_configuration(config)
    classifier = Wikipedia::VandalismDetection::Classifier.new
    Wikipedia::VandalismDetection::Evaluator.new(classifier)
  end

  describe "exceptions" do

    it "raises an GroundTruthFileNotConfiguredError unless a ground thruth file is configured" do
      config = test_config
      config.instance_variable_set :@test_corpus_ground_truth_file, nil
      evaluator = evaluator_for_configuration(config)

      expect { evaluator.evaluate_testcorpus_classification }.to raise_error \
        Wikipedia::VandalismDetection::GroundTruthFileNotConfiguredError
    end

    it "raises an GroundTruthFileNotFoundError unless the ground thruth file can be found" do
      config = test_config
      config.instance_variable_set :@test_corpus_ground_truth_file, 'false-file-name.txt'
      evaluator = evaluator_for_configuration(config)

      expect { evaluator.evaluate_testcorpus_classification }.to raise_error \
        Wikipedia::VandalismDetection::GroundTruthFileNotFoundError
    end
  end

  it "returns a performance values Hash" do
    performance_values = @evaluator.evaluate_testcorpus_classification(sample_count: @sample_count)
    expect(performance_values).to be_a Hash
  end

  # Every key listed here must be present (non-nil) in the returned Hash.
  [ :fp_rates,
    :tp_rates,
    :precisions,
    :recalls,
    :pr_auc,
    :roc_auc,
    :total_precision,
    :total_recall
  ].each do |attr|
    it "returns a performance values Hash with property'#{attr}'" do
      performance_values = @evaluator.evaluate_testcorpus_classification(sample_count: @sample_count)
      expect(performance_values[attr]).to_not be_nil
    end
  end

  it "runs the classification file creation" do
    # File.exist? is the supported spelling; File.exists? is deprecated.
    expect(File.exist?(@test_classification_file)).to be false
    @evaluator.evaluate_testcorpus_classification
    expect(File.exist?(@test_classification_file)).to be true
  end

  it "overwrites the old classification file" do
    config = test_config

    # First run with one feature set ...
    config.instance_variable_set(:@features, ['comment length'])
    evaluator_for_configuration(config).evaluate_testcorpus_classification
    content_old = File.read(@test_classification_file)

    # ... second run with a different feature set must produce different content.
    config.instance_variable_set(:@features, ['anonymity'])
    evaluator_for_configuration(config).evaluate_testcorpus_classification
    content_new = File.read(@test_classification_file)

    expect(content_old).to_not eq content_new
  end
end
422
+
423
# Specs for Evaluator#cross_validate, called with and without the
# equally_distributed option.
describe "#cross_validate" do

  it "returns an evaluation object" do
    result_class = @evaluator.cross_validate.class
    expect(result_class).to eq Java::WekaClassifiers::Evaluation
  end

  it "can cross validates the classifier" do
    expect do
      @evaluator.cross_validate
    end.to_not raise_error
  end

  it "can cross validates the classifier with equally distributed samples" do
    expect do
      @evaluator.cross_validate(equally_distributed: true)
    end.to_not raise_error
  end
end
438
+
439
# Specs for Evaluator#curve_data. The same expectations are run once for the
# default call and once with equally_distributed: true.
describe "#curve_data" do

  # Nested group name => how the curve data is requested from the evaluator.
  scenarios = [
    ["all samples", lambda { |evaluator| evaluator.curve_data }],
    ["equally distributed samples", lambda { |evaluator| evaluator.curve_data(equally_distributed: true) }]
  ]

  scenarios.each do |scenario_name, request_data|
    describe scenario_name do

      before do
        @data = request_data.call(@evaluator)
      end

      it "returns a Hash" do
        expect(@data).to be_a Hash
      end

      [:precision, :recall].each do |key|
        it "includes #{key} curve data" do
          expect(@data[key]).to be_an Array
        end
      end

      it "includes area_under_prc data" do
        expect(@data[:area_under_prc]).to be_a Numeric
      end

      [:precision, :recall].each do |key|
        it "has non-empty :#{key} Array contents" do
          expect(@data[key]).to_not be_empty
        end
      end
    end
  end
end
503
+
504
# Specs for Evaluator#feature_analysis: the result is indexed by feature and
# then by threshold, each threshold entry holding :fp, :fn, :tp and :tn.
describe "#feature_analysis" do

  it "returns a hash" do
    expect(@evaluator.feature_analysis(sample_count: 100)).to be_a Hash
  end

  it "returns a hash with feature count size" do
    entry_count = @evaluator.feature_analysis(sample_count: 100).count
    expect(entry_count).to eq @config.features.count
  end

  it "returns a hash with sample count number of data hashes" do
    sample_count = 5

    @evaluator.feature_analysis(sample_count: sample_count).each_value do |threshold_hash|
      expect(threshold_hash.count).to eq sample_count
    end
  end

  it "returns the four predictive values in each features threshold hash" do
    first_feature = @evaluator.feature_analysis[@config.features.first]
    threshold_hash = first_feature[0.0]

    [:fp, :fn, :tp, :tn].each do |predictive_value|
      expect(threshold_hash).to have_key(predictive_value)
    end
  end
end
535
+
536
# Specs for Evaluator#full_analysis: the result is indexed by threshold, each
# entry holding :fp, :fn, :tp and :tn.
describe "#full_analysis" do

  it "returns a hash" do
    expect(@evaluator.full_analysis(sample_count: 100)).to be_a Hash
  end

  it "returns a hash with smaple count number of threshold hashes" do
    sample_count = 5
    threshold_count = @evaluator.full_analysis(sample_count: sample_count).count
    expect(threshold_count).to eq sample_count
  end

  it "returns the four predictive values in each features threshold hash" do
    threshold_hash = @evaluator.full_analysis[0.0]

    [:fp, :fn, :tp, :tn].each do |predictive_value|
      expect(threshold_hash).to have_key(predictive_value)
    end
  end
end
559
+
560
# Table-driven specs for Evaluator.true_positive?(ground_truth, confidence, threshold).
describe "#true_positive?" do
  before do
    @ground_truths = {
      vandalism: Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT,
      regular: Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    }
    @threshold = 0.7
  end

  # Each row: example description, ground truth key, confidence, expected result.
  # A confidence of 0.7 is the "same as threshold" case.
  [
    ["returns true if the given confidence is higher than a threshold regarding the ground truth 'V'", :vandalism, 0.8, true],
    ["returns false if the given confidence is lower than a threshold regarding the ground truth 'V'", :vandalism, 0.5, false],
    ["returns false for the same confidence and threshold if ground truth is 'V'", :vandalism, 0.7, false],
    ["returns false if the given confidence is higher than a threshold regarding the ground truth 'R'", :regular, 0.8, false],
    ["returns false if the given confidence is lower than a threshold regarding the ground truth 'R'", :regular, 0.5, false]
  ].each do |description, truth_key, confidence, expected|
    it description do
      ground_truth = @ground_truths[truth_key]
      result = Wikipedia::VandalismDetection::Evaluator.true_positive?(ground_truth, confidence, @threshold)
      expect(result).to be expected
    end
  end
end
587
+
588
# Table-driven specs for Evaluator.true_negative?(ground_truth, confidence, threshold).
describe "#true_negative?" do
  before do
    @ground_truths = {
      vandalism: Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT,
      regular: Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    }
    @threshold = 0.7
  end

  # Each row: example description, ground truth key, confidence, expected result.
  # A confidence of 0.7 is the "same as threshold" case.
  [
    ["returns true if the given confidence is lower than a threshold regarding the ground truth 'R'", :regular, 0.5, true],
    ["returns false if the given confidence is higher than a threshold regarding the ground truth 'R'", :regular, 0.8, false],
    ["returns false for the same confidence and threshold if ground truth is 'R'", :regular, 0.7, false],
    ["returns false if the given confidence is lower than a threshold regarding the ground truth 'V'", :vandalism, 0.5, false],
    ["returns false if the given confidence is higher than a threshold regarding the ground truth 'V'", :vandalism, 0.8, false]
  ].each do |description, truth_key, confidence, expected|
    it description do
      ground_truth = @ground_truths[truth_key]
      result = Wikipedia::VandalismDetection::Evaluator.true_negative?(ground_truth, confidence, @threshold)
      expect(result).to be expected
    end
  end
end
615
+
616
# Table-driven specs for Evaluator.false_positive?(ground_truth, confidence, threshold).
describe "#false_positive?" do
  before do
    @ground_truths = {
      vandalism: Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT,
      regular: Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    }
    @threshold = 0.7
  end

  # Each row: example description, ground truth key, confidence, expected result.
  # A confidence of 0.7 is the "same as threshold" case.
  [
    ["returns true if the given confidence is higher than a threshold regarding the ground truth 'R'", :regular, 0.8, true],
    ["returns false if the given confidence is lower than a threshold regarding the ground truth 'R'", :regular, 0.5, false],
    ["returns true for the same confidence and threshold if ground truth is 'R'", :regular, 0.7, true],
    ["returns false if the given confidence is higher than a threshold regarding the ground truth 'V'", :vandalism, 0.8, false],
    ["returns false if the given confidence is lower than a threshold regarding the ground truth 'V'", :vandalism, 0.5, false]
  ].each do |description, truth_key, confidence, expected|
    it description do
      ground_truth = @ground_truths[truth_key]
      result = Wikipedia::VandalismDetection::Evaluator.false_positive?(ground_truth, confidence, @threshold)
      expect(result).to be expected
    end
  end
end
643
+
644
# Table-driven specs for Evaluator.false_negative?(ground_truth, confidence, threshold).
describe "#false_negative?" do
  before do
    @ground_truths = {
      vandalism: Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT,
      regular: Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    }
    @threshold = 0.7
  end

  # Each row: example description, ground truth key, confidence, expected result.
  # A confidence of 0.7 is the "same as threshold" case.
  [
    ["returns true if the given confidence is lower than a threshold regarding the ground truth 'V'", :vandalism, 0.5, true],
    ["returns false if the given confidence is higher than a threshold regarding the ground truth 'V'", :vandalism, 0.8, false],
    ["returns true for the same confidence and threshold if ground truth is 'V'", :vandalism, 0.7, true],
    ["returns false if the given confidence is lower than a threshold regarding the ground truth 'R'", :regular, 0.5, false],
    ["returns false if the given confidence is higher than a threshold regarding the ground truth 'R'", :regular, 0.8, false]
  ].each do |description, truth_key, confidence, expected|
    it description do
      ground_truth = @ground_truths[truth_key]
      result = Wikipedia::VandalismDetection::Evaluator.false_negative?(ground_truth, confidence, @threshold)
      expect(result).to be expected
    end
  end
end
671
+ end