wikipedia-vandalism_detection 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,711 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Evaluator do
4
# Values shared by the examples: the two short class constants taken from
# Instances and the number of samples requested for curve calculations.
let(:vandalism) do
  Instances::VANDALISM_SHORT
end

let(:regular) do
  Instances::REGULAR_SHORT
end

let(:sample_count) do
  10
end
8
+
9
before do
  # Switch to the test configuration first, then keep a handle on it so the
  # output paths below can be read from it.
  use_test_configuration
  @config = test_config

  # Paths of the artefacts the examples create and the after hook removes.
  @training_arff_file = @config.training_output_arff_file
  @test_arff_file = @config.test_output_arff_file
  @test_classification_file = @config.test_output_classification_file
  @build_dir = @config.output_base_directory
end
18
+
19
after do
  # Remove each generated file (training arff, test arff, classification.txt)
  # together with the directory that contains it. The classification branch
  # previously called the non-existent File.rm_r, which raised NoMethodError
  # whenever that file was still present; FileUtils.rm_r is the intended call.
  generated_files = [
    @training_arff_file,
    @test_arff_file,
    @test_classification_file
  ]

  generated_files.each do |file|
    next unless File.exist?(file)

    File.delete(file)
    FileUtils.rm_r(File.dirname(file))
  end

  # remove output base directory
  FileUtils.rm_r(@build_dir) if Dir.exist?(@build_dir)
end
44
+
45
describe '#initialize' do
  it 'raises an ArgumentError if classifier argument is not a Classifier' do
    expect { Evaluator.new('') }.to raise_error ArgumentError
  end

  it 'does not raise an error when a classifier is passed' do
    classifier = Classifier.new

    # A negated raise_error must not name an error class: with a class given,
    # any *other* error would let the example pass silently, and RSpec 3
    # rejects that form outright.
    expect { Evaluator.new(classifier) }.not_to raise_error
  end
end
55
+
56
# Default collaborators, built lazily for every example that asks for them.
let(:classifier) do
  Classifier.new
end

let(:evaluator) do
  Evaluator.new(classifier)
end
58
+
59
+ describe '#test_performance_curves' do
60
# Classifier output fixture: one entry per edit, keyed by
# :"<old_revision_id>-<new_revision_id>".
let(:classification) do
  rows = [
    [1, 2, 'R', 0.0],
    [2, 3, 'R', 0.3],
    [3, 4, 'V', 0.8],
    [4, 5, 'V', 1.0]
  ]

  rows.each_with_object({}) do |(old_id, new_id, label, confidence), edits|
    edits[:"#{old_id}-#{new_id}"] = {
      old_revision_id: old_id,
      new_revision_id: new_id,
      class: label,
      confidence: confidence
    }
  end
end
88
+
89
# The ground truth holds one sample more than the classification. The extra
# entry stands for samples that drop out during feature calculation
# (e.g. redirects are not considered).
let(:ground_truth) do
  rows = [
    [0, 1, 'R'], # this is a sample that is not used!
    [1, 2, 'R'],
    [2, 3, 'V'],
    [3, 4, 'R'],
    [4, 5, 'V']
  ]

  rows.each_with_object({}) do |(old_id, new_id, label), annotations|
    annotations[:"#{old_id}-#{new_id}"] = {
      old_revision_id: old_id,
      new_revision_id: new_id,
      class: label
    }
  end
end
120
+
121
# Result of the method under test for the fixtures defined in this group.
let(:curve_data) do
  arguments = [ground_truth, classification, sample_count]
  evaluator.test_performance_curves(*arguments)
end
128
+
129
it 'returns a Hash' do
  expect(curve_data).to be_a(Hash)
end

# Every key listed here must be present in the returned Hash.
expected_keys = %i[recalls precisions fp_rates tp_rates pr_auc roc_auc]

expected_keys.each do |key|
  it "returns a Hash including #{key}" do
    expect(curve_data).to have_key(key)
  end
end
145
+
146
describe '#predictive_values' do
  let(:threshold) { 0.5 }
  let(:predictive_values) do
    evaluator.predictive_values(ground_truth, classification, threshold)
  end

  it 'returns a Hash' do
    expect(predictive_values).to be_a(Hash)
  end

  # Maps each threshold to the expected tp/fp/tn/fn counts for the
  # ground_truth and classification fixtures of the enclosing group.
  expected_counts = {
    0.0 => { tp: 2, fp: 2, tn: 0, fn: 0 },
    0.3 => { tp: 1, fp: 1, tn: 1, fn: 1 },
    0.5 => { tp: 1, fp: 1, tn: 1, fn: 1 },
    0.8 => { tp: 1, fp: 1, tn: 1, fn: 1 },
    0.9 => { tp: 1, fp: 0, tn: 2, fn: 1 },
    1.0 => { tp: 0, fp: 0, tn: 2, fn: 2 }
  }

  expected_counts.each do |cutoff, counts|
    it "returns the right values for threshold #{cutoff}" do
      actual = evaluator.predictive_values(ground_truth, classification, cutoff)

      expect(actual).to eq counts
    end
  end
end
175
+
176
describe '#sort_curve_values' do
  # Unsorted input points (with duplicates) ...
  let(:x) { [0.7, 0.4, 0.8, 0.4, 0.7] }
  let(:y) { [0.6, 0.8, 0.2, 0.6, 0.6] }

  # ... and the values expected back when no start/end values are passed.
  let(:x_sorted) { [0.4, 0.4, 0.7, 0.8] }
  let(:y_sorted) { [0.8, 0.6, 0.6, 0.2] }

  it 'returns the unique sorted input values' do
    expected = { x: x_sorted, y: y_sorted }

    expect(evaluator.sort_curve_values(x, y)).to eq expected
  end

  it 'adds start values if given' do
    start_values = { x: -1.0, y: -2.0 }
    expected = {
      x: [start_values[:x], *x_sorted],
      y: [start_values[:y], *y_sorted]
    }

    result = evaluator.sort_curve_values(x, y, start_values)

    expect(result).to eq expected
  end

  it 'adds x start value if only one value given' do
    start_values = { x: -1.0 }
    # the missing y start value is expected to repeat the first sorted y
    expected = {
      x: [start_values[:x], *x_sorted],
      y: [y_sorted.first, *y_sorted]
    }

    result = evaluator.sort_curve_values(x, y, start_values)

    expect(result).to eq expected
  end

  it 'adds y start value if only one value given' do
    start_values = { y: -2.0 }
    # the missing x start value is expected to repeat the first sorted x
    expected = {
      x: [x_sorted.first, *x_sorted],
      y: [start_values[:y], *y_sorted]
    }

    result = evaluator.sort_curve_values(x, y, start_values)

    expect(result).to eq expected
  end

  it 'adds end values if given' do
    end_values = { x: -1.0, y: -2.0 }
    expected = {
      x: [*x_sorted, end_values[:x]],
      y: [*y_sorted, end_values[:y]]
    }

    result = evaluator.sort_curve_values(x, y, nil, end_values)

    expect(result).to eq expected
  end

  it 'adds y end values if only one value is given' do
    end_values = { y: -2.0 }
    # the missing x end value is expected to repeat the last sorted x
    expected = {
      x: [*x_sorted, x_sorted.last],
      y: [*y_sorted, end_values[:y]]
    }

    result = evaluator.sort_curve_values(x, y, nil, end_values)

    expect(result).to eq expected
  end

  it 'adds x end values if only one value is given' do
    end_values = { x: -1.0 }
    # the missing y end value is expected to repeat the last sorted y
    expected = {
      x: [*x_sorted, end_values[:x]],
      y: [*y_sorted, y_sorted.last]
    }

    result = evaluator.sort_curve_values(x, y, nil, end_values)

    expect(result).to eq expected
  end
end
262
+
263
describe '#area_under_curve' do
  # Area under the precision-recall curve: recall on the x axis, precision on
  # the y axis. (The x argument previously repeated the precision values.)
  let(:pr_auc) do
    evaluator.area_under_curve(
      curve_data[:recalls],
      curve_data[:precisions]
    )
  end

  # Area under the ROC curve: false-positive rate on x, true-positive rate on y.
  let(:roc_auc) do
    evaluator.area_under_curve(
      curve_data[:fp_rates],
      curve_data[:tp_rates]
    )
  end

  it 'returns a numeric value for pr_auc' do
    expect(pr_auc).to be_a Numeric
  end

  it 'returns a numeric value between 0.0 & 1.0 for pr_auc' do
    expect(pr_auc.between?(0.0, 1.0)).to be true
  end

  it 'returns a numeric value for roc_auc' do
    expect(roc_auc).to be_a Numeric
  end

  it 'returns a numeric value between 0.0 & 1.0 for roc_auc' do
    expect(roc_auc.between?(0.0, 1.0)).to be true
  end

  it 'returns the right values' do
    x = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    y = [1.0, 0.8, 0.6, 0.4, 0.2, 0.0]
    auc = 0.5

    # The area is accumulated from floating point terms, so compare with a
    # tolerance instead of exact equality.
    expect(evaluator.area_under_curve(x, y)).to be_within(1e-9).of(auc)
  end
end
304
+ end
305
+
306
describe '#create_testcorpus_classification_file!' do
  let(:ground_truth) do
    { # see resources file corpora/test/ground-truth.txt
      '0-1': { # this is a sample that is not used!
        old_revision_id: 0,
        new_revision_id: 1,
        class: 'R'
      },
      '307084144-326873205': {
        old_revision_id: 307_084_144,
        new_revision_id: 326_873_205,
        class: 'R'
      },
      '326471754-326978767': {
        old_revision_id: 326_471_754,
        new_revision_id: 326_978_767,
        class: 'V'
      },
      '328774035-328774110': {
        old_revision_id: 328_774_035,
        new_revision_id: 328_774_110,
        class: 'R'
      }
    }
  end

  # Creates the classification file and returns its lines (header first).
  # File.readlines closes the file again, so no handle stays open while the
  # after hook deletes the file; it also avoids the removed IO#lines.
  let(:classification_lines) do
    evaluator.create_testcorpus_classification_file!(@test_classification_file, ground_truth)
    File.readlines(@test_classification_file)
  end

  it 'raises an argument error if ground_truth param is nil' do
    expect { evaluator.create_testcorpus_classification_file!('blah', nil) }
      .to raise_error ArgumentError
  end

  it 'creates a classification file in the base output directory' do
    expect(File.exist?(@test_classification_file)).to be false
    evaluator.create_testcorpus_classification_file!(@test_classification_file, ground_truth)
    expect(File.exist?(@test_classification_file)).to be true
  end

  it 'creates a file with an appropriate header' do
    header = classification_lines.first.split(' ')

    instances = Weka::Core::Instances.from_arff(@test_arff_file)
    # the last two arff attributes are not expected as header columns
    features = instances.attribute_names.map(&:upcase)[0...-2]
    proposed_header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *features]

    expect(header).to eq proposed_header
  end

  it 'creates a file with an appropriate number of lines' do
    sample_lines = classification_lines.drop(1) # remove header
    samples_count = Weka::Core::Instances.from_arff(@test_arff_file).size

    expect(sample_lines.count).to eq samples_count
  end

  it 'has the short class names as class value' do
    sample_lines = classification_lines.drop(1) # remove header

    short_classes = Instances::CLASSES_SHORT
    names = [
      short_classes[Instances::REGULAR_CLASS_INDEX],
      short_classes[Instances::VANDALISM_CLASS_INDEX],
      short_classes[Instances::NOT_KNOWN_INDEX]
    ]

    sample_lines.each do |line|
      class_name = line.split[2] # third column holds the class
      expect(names).to include class_name
    end
  end
end
389
+
390
describe '#evaluate_testcorpus_classification' do
  describe 'exceptions' do
    it 'raises an error unless a ground truth file is configured' do
      config = test_config
      config.instance_variable_set :@test_corpus_ground_truth_file, nil
      use_configuration(config)

      # built after the configuration has been replaced
      classifier = Classifier.new
      evaluator = Evaluator.new(classifier)

      expect { evaluator.evaluate_testcorpus_classification }.to raise_error(
        Wikipedia::VandalismDetection::GroundTruthFileNotConfiguredError
      )
    end

    it 'raises an error unless the ground truth file can be found' do
      config = test_config
      config.instance_variable_set(:@test_corpus_ground_truth_file, 'false-file-name.txt')
      use_configuration(config)

      # built after the configuration has been replaced
      classifier = Classifier.new
      evaluator = Evaluator.new(classifier)

      expect { evaluator.evaluate_testcorpus_classification }.to raise_error(
        Wikipedia::VandalismDetection::GroundTruthFileNotFoundError
      )
    end
  end

  it 'returns a performance values Hash' do
    values = evaluator.evaluate_testcorpus_classification(sample_count: sample_count)
    expect(values).to be_a Hash
  end

  %i[
    fp_rates
    tp_rates
    precisions
    recalls
    pr_auc
    roc_auc
    total_precision
    total_recall
  ].each do |attribute|
    it "returns a performance values Hash with property '#{attribute}'" do
      values = evaluator.evaluate_testcorpus_classification(sample_count: sample_count)
      expect(values[attribute]).to_not be_nil
    end
  end

  it 'runs the classification file creation' do
    expect(File.exist?(@test_classification_file)).to be false
    evaluator.evaluate_testcorpus_classification
    expect(File.exist?(@test_classification_file)).to be true
  end

  it 'overwrites the old classification file' do
    config = test_config

    # first run with one feature ...
    config.instance_variable_set(:@features, ['comment length'])
    use_configuration(config)

    classifier = Classifier.new
    evaluator = Evaluator.new(classifier)

    evaluator.evaluate_testcorpus_classification
    content_old = File.read(@test_classification_file)

    # ... second run with a different feature must produce different content
    config.instance_variable_set(:@features, ['anonymity'])
    use_configuration(config)

    classifier = Classifier.new
    evaluator = Evaluator.new(classifier)

    evaluator.evaluate_testcorpus_classification
    content_new = File.read(@test_classification_file)

    expect(content_old).to_not eq content_new
  end
end
468
+
469
describe '#cross_validate' do
  it 'returns an evaluation object' do
    result = evaluator.cross_validate
    expect(result).to be_a Java::WekaClassifiers::Evaluation
  end

  # With equally_distributed: true the call is expected to return an Array
  # of evaluation objects rather than a single one.
  it 'cross-validates the classifier with equally distributed samples' do
    result = evaluator.cross_validate(equally_distributed: true)
    expect(result).to be_an Array

    result.each do |item|
      expect(item).to be_a Java::WekaClassifiers::Evaluation
    end
  end
end
484
+
485
# Covers evaluator.curve_data for the default call and for the
# equally_distributed: true option. Both variants must satisfy the same
# set of expectations, which is defined once as shared examples.
describe '#curve_data' do
  # Expects `data` to be defined by the including group.
  shared_examples 'curve data' do
    it 'returns a Hash' do
      expect(data).to be_a Hash
    end

    it 'includes precision curve data' do
      expect(data[:precision]).to be_an Array
    end

    it 'includes recall curve data' do
      expect(data[:recall]).to be_an Array
    end

    it 'includes area_under_prc data' do
      expect(data[:area_under_prc]).to be_a Numeric
    end

    it 'has non-empty :precision Array contents' do
      expect(data[:precision]).to_not be_empty
    end

    it 'has non-empty :recall Array contents' do
      expect(data[:recall]).to_not be_empty
    end
  end

  describe 'all samples' do
    let(:data) { evaluator.curve_data }

    include_examples 'curve data'
  end

  describe 'equally distributed samples' do
    let(:data) { evaluator.curve_data(equally_distributed: true) }

    include_examples 'curve data'
  end
end
542
+
543
# Covers evaluator.feature_analysis: return type, one entry per configured
# feature, one threshold entry per requested sample, and the presence of the
# :fp, :fn, :tp and :tn keys in a threshold entry.
describe '#feature_analysis' do
  it 'returns a hash' do
    expect(evaluator.feature_analysis(sample_count: 100)).to be_a Hash
  end

  it 'returns a hash with feature count size' do
    feature_count = evaluator.feature_analysis(sample_count: 100).count
    expect(feature_count).to eq @config.features.count
  end

  it 'returns a hash with sample count number of data hashes' do
    expected_count = 5
    per_feature = evaluator.feature_analysis(sample_count: expected_count)

    per_feature.each_value do |thresholds|
      expect(thresholds.count).to eq expected_count
    end
  end

  it 'returns the four predictive values in each features threshold hash' do
    first_feature = @config.features.first
    # Only the first feature's entry at threshold 0.0 is inspected.
    threshold_hash = evaluator.feature_analysis[first_feature][0.0]

    %i[fp fn tp tn].each do |key|
      expect(threshold_hash).to have_key(key)
    end
  end
end
573
+
574
# Covers evaluator.full_analysis: return type, number of threshold entries
# for a given sample_count, and the presence of the :fp, :fn, :tp and :tn
# keys in the entry at threshold 0.0.
describe '#full_analysis' do
  it 'returns a hash' do
    analysis = evaluator.full_analysis(sample_count: 100)
    expect(analysis).to be_a Hash
  end

  # Description typo ("smaple") corrected.
  it 'returns a hash with sample count number of threshold hashes' do
    sample_count = 5
    analysis = evaluator.full_analysis(sample_count: sample_count)
    expect(analysis.count).to eq sample_count
  end

  it 'returns the four predictive values in each features threshold hash' do
    analysis = evaluator.full_analysis
    threshold_hash = analysis[0.0]

    expect(threshold_hash).to have_key(:fp)
    expect(threshold_hash).to have_key(:fn)
    expect(threshold_hash).to have_key(:tp)
    expect(threshold_hash).to have_key(:tn)
  end
end
596
+
597
# Evaluator.true_positive? is expected to be true only for the `vandalism`
# ground truth with a confidence strictly above the threshold.
describe '#true_positive?' do
  let(:threshold) { 0.7 }

  it 'returns true if confidence > threshold regarding ground truth "V"' do
    confidence = threshold + 0.1
    expect(Evaluator.true_positive?(vandalism, confidence, threshold)).to be true
  end

  it 'returns false if confidence < threshold regarding ground truth "V"' do
    confidence = threshold - 0.2
    expect(Evaluator.true_positive?(vandalism, confidence, threshold)).to be false
  end

  it 'returns false for same confidence & threshold if ground truth is "V"' do
    expect(Evaluator.true_positive?(vandalism, threshold, threshold)).to be false
  end

  it 'returns false if confidence > threshold regarding ground truth "R"' do
    confidence = threshold + 0.1
    expect(Evaluator.true_positive?(regular, confidence, threshold)).to be false
  end

  it 'returns false if confidence < threshold regarding ground truth "R"' do
    confidence = threshold - 0.1
    expect(Evaluator.true_positive?(regular, confidence, threshold)).to be false
  end
end
625
+
626
# Evaluator.true_negative? is expected to be true only for the `regular`
# ground truth with a confidence strictly below the threshold.
describe '#true_negative?' do
  let(:threshold) { 0.7 }

  it 'returns true if confidence < threshold regarding ground truth "R"' do
    confidence = threshold - 0.1
    expect(Evaluator.true_negative?(regular, confidence, threshold)).to be true
  end

  it 'returns false if confidence > threshold regarding ground truth "R"' do
    confidence = threshold + 0.1
    expect(Evaluator.true_negative?(regular, confidence, threshold)).to be false
  end

  it 'returns false for same confidence & threshold if ground truth is "R"' do
    expect(Evaluator.true_negative?(regular, threshold, threshold)).to be false
  end

  it 'returns false if confidence < threshold regarding ground truth "V"' do
    confidence = threshold - 0.1
    expect(Evaluator.true_negative?(vandalism, confidence, threshold)).to be false
  end

  it "returns false if confidence > threshold regarding ground truth 'V'" do
    true_neg = Evaluator.true_negative?(vandalism, 0.8, threshold)
    expect(true_neg).to be false
  end
end
653
+
654
# Evaluator.false_positive? is expected to be true only for the `regular`
# ground truth with a confidence at or above the threshold.
describe '#false_positive?' do
  let(:threshold) { 0.7 }
  let(:higher_confidence) { threshold + 0.1 }
  let(:lower_confidence) { threshold - 0.1 }

  it 'returns true if confidence > threshold regarding ground truth "R"' do
    result = Evaluator.false_positive?(regular, higher_confidence, threshold)
    expect(result).to be true
  end

  it 'returns false if confidence < threshold regarding ground truth "R"' do
    result = Evaluator.false_positive?(regular, lower_confidence, threshold)
    expect(result).to be false
  end

  it 'returns true for same confidence & threshold if ground truth is "R"' do
    result = Evaluator.false_positive?(regular, threshold, threshold)
    expect(result).to be true
  end

  it 'returns false if confidence > threshold regarding ground truth "V"' do
    result = Evaluator.false_positive?(vandalism, higher_confidence, threshold)
    expect(result).to be false
  end

  it 'returns false if confidence < threshold regarding ground truth "V"' do
    result = Evaluator.false_positive?(vandalism, lower_confidence, threshold)
    expect(result).to be false
  end
end
682
+
683
# Evaluator.false_negative? is expected to be true only for the `vandalism`
# ground truth with a confidence at or below the threshold.
describe '#false_negative?' do
  let(:threshold) { 0.7 }
  let(:higher_confidence) { threshold + 0.1 }
  let(:lower_confidence) { threshold - 0.1 }

  it 'returns true if confidence < threshold regarding ground truth "V"' do
    result = Evaluator.false_negative?(vandalism, lower_confidence, threshold)
    expect(result).to be true
  end

  it 'returns false if confidence > threshold regarding ground truth "V"' do
    result = Evaluator.false_negative?(vandalism, higher_confidence, threshold)
    expect(result).to be false
  end

  it 'returns true for same confidence & threshold if ground truth is "V"' do
    result = Evaluator.false_negative?(vandalism, threshold, threshold)
    expect(result).to be true
  end

  it 'returns false if confidence < threshold regarding ground truth "R"' do
    result = Evaluator.false_negative?(regular, lower_confidence, threshold)
    expect(result).to be false
  end

  it 'returns false if confidence > threshold regarding ground truth "R"' do
    result = Evaluator.false_negative?(regular, higher_confidence, threshold)
    expect(result).to be false
  end
end
711
+ end