wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,711 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Evaluator do
4
+ let(:vandalism) { Instances::VANDALISM_SHORT }
5
+ let(:regular) { Instances::REGULAR_SHORT }
6
+
7
+ let(:sample_count) { 10 }
8
+
9
+ before do
10
+ use_test_configuration
11
+ @config = test_config
12
+
13
+ @build_dir = @config.output_base_directory
14
+ @test_arff_file = @config.test_output_arff_file
15
+ @training_arff_file = @config.training_output_arff_file
16
+ @test_classification_file = @config.test_output_classification_file
17
+ end
18
+
19
+ after do
20
+ # remove training arff file
21
+ if File.exist?(@training_arff_file)
22
+ File.delete(@training_arff_file)
23
+ directory = File.dirname(@training_arff_file)
24
+ FileUtils.rm_r(directory)
25
+ end
26
+
27
+ # remove test arff file
28
+ if File.exist?(@test_arff_file)
29
+ File.delete(@test_arff_file)
30
+ directory = File.dirname(@test_arff_file)
31
+ FileUtils.rm_r(directory)
32
+ end
33
+
34
+ # remove classification.txt
35
+ if File.exist?(@test_classification_file)
36
+ File.delete(@test_classification_file)
37
+ directory = File.dirname(@test_classification_file)
38
+ File.rm_r(directory)
39
+ end
40
+
41
+ # remove output base directory
42
+ FileUtils.rm_r(@build_dir) if Dir.exist?(@build_dir)
43
+ end
44
+
45
+ describe '#initialize' do
46
+ it 'raises an ArgumentError if classifier argument is not a Classifier' do
47
+ expect { Evaluator.new('') }.to raise_error ArgumentError
48
+ end
49
+
50
+ it 'does not raise an error when a classifier is passed' do
51
+ classifier = Classifier.new
52
+ expect { Evaluator.new(classifier) }.not_to raise_error # negated raise_error must not name an error class
53
+ end
54
+ end
55
+
56
+ let(:classifier) { Classifier.new }
57
+ let(:evaluator) { Evaluator.new(classifier) }
58
+
59
+ describe '#test_performance_curves' do
60
+ let(:classification) do
61
+ {
62
+ '1-2': {
63
+ old_revision_id: 1,
64
+ new_revision_id: 2,
65
+ class: 'R',
66
+ confidence: 0.0
67
+ },
68
+ '2-3': {
69
+ old_revision_id: 2,
70
+ new_revision_id: 3,
71
+ class: 'R',
72
+ confidence: 0.3
73
+ },
74
+ '3-4': {
75
+ old_revision_id: 3,
76
+ new_revision_id: 4,
77
+ class: 'V',
78
+ confidence: 0.8
79
+ },
80
+ '4-5': {
81
+ old_revision_id: 4,
82
+ new_revision_id: 5,
83
+ class: 'V',
84
+ confidence: 1.0
85
+ }
86
+ }
87
+ end
88
+
89
+ # ground truth has one more sample than the classification to represent
90
+ # samples that drop out during feature calculation (e.g. redirects are not considered)
91
+ let(:ground_truth) do
92
+ {
93
+ '0-1': { # this is a sample that is not used!
94
+ old_revision_id: 0,
95
+ new_revision_id: 1,
96
+ class: 'R'
97
+ },
98
+ '1-2': {
99
+ old_revision_id: 1,
100
+ new_revision_id: 2,
101
+ class: 'R'
102
+ },
103
+ '2-3': {
104
+ old_revision_id: 2,
105
+ new_revision_id: 3,
106
+ class: 'V'
107
+ },
108
+ '3-4': {
109
+ old_revision_id: 3,
110
+ new_revision_id: 4,
111
+ class: 'R'
112
+ },
113
+ '4-5': {
114
+ old_revision_id: 4,
115
+ new_revision_id: 5,
116
+ class: 'V'
117
+ }
118
+ }
119
+ end
120
+
121
+ let(:curve_data) do
122
+ evaluator.test_performance_curves(
123
+ ground_truth,
124
+ classification,
125
+ sample_count
126
+ )
127
+ end
128
+
129
+ it 'returns a Hash' do
130
+ expect(curve_data).to be_a Hash
131
+ end
132
+
133
+ %i[
134
+ recalls
135
+ precisions
136
+ fp_rates
137
+ tp_rates
138
+ pr_auc
139
+ roc_auc
140
+ ].each do |attribute|
141
+ it "returns a Hash including #{attribute}" do
142
+ expect(curve_data).to have_key(attribute)
143
+ end
144
+ end
145
+
146
+ describe '#predictive_values' do
147
+ let(:threshold) { 0.5 }
148
+ let(:predictive_values) do
149
+ evaluator.predictive_values(ground_truth, classification, threshold)
150
+ end
151
+
152
+ it 'returns a Hash' do
153
+ expect(predictive_values).to be_a Hash
154
+ end
155
+
156
+ [
157
+ { threshold: 0.0, result: { tp: 2, fp: 2, tn: 0, fn: 0 } },
158
+ { threshold: 0.3, result: { tp: 1, fp: 1, tn: 1, fn: 1 } },
159
+ { threshold: 0.5, result: { tp: 1, fp: 1, tn: 1, fn: 1 } },
160
+ { threshold: 0.8, result: { tp: 1, fp: 1, tn: 1, fn: 1 } },
161
+ { threshold: 0.9, result: { tp: 1, fp: 0, tn: 2, fn: 1 } },
162
+ { threshold: 1.0, result: { tp: 0, fp: 0, tn: 2, fn: 2 } }
163
+ ].each do |values|
164
+ it "returns the right values for threshold #{values[:threshold]}" do
165
+ predictive_values = evaluator.predictive_values(
166
+ ground_truth,
167
+ classification,
168
+ values[:threshold]
169
+ )
170
+
171
+ expect(predictive_values).to eq values[:result]
172
+ end
173
+ end
174
+ end
175
+
176
+ describe '#sort_curve_values' do
177
+ let(:x) { [0.7, 0.4, 0.8, 0.4, 0.7] }
178
+ let(:y) { [0.6, 0.8, 0.2, 0.6, 0.6] }
179
+
180
+ let(:x_sorted) { [0.4, 0.4, 0.7, 0.8] }
181
+ let(:y_sorted) { [0.8, 0.6, 0.6, 0.2] }
182
+
183
+ it 'returns the unique sorted input values' do
184
+ hash = { x: x_sorted, y: y_sorted }
185
+ sorted = evaluator.sort_curve_values(x, y)
186
+
187
+ expect(sorted).to eq hash
188
+ end
189
+
190
+ it 'adds start values if given' do
191
+ start_values = { x: -1.0, y: -2.0 }
192
+ hash = {
193
+ x: x_sorted.unshift(start_values[:x]),
194
+ y: y_sorted.unshift(start_values[:y])
195
+ }
196
+
197
+ sorted = evaluator.sort_curve_values(x, y, start_values)
198
+
199
+ expect(sorted).to eq hash
200
+ end
201
+
202
+ it 'adds x start value if only one value given' do
203
+ start_values = { x: -1.0 }
204
+ hash = {
205
+ x: x_sorted.unshift(start_values[:x]),
206
+ y: y_sorted.unshift(y_sorted.first)
207
+ }
208
+
209
+ sorted = evaluator.sort_curve_values(x, y, start_values)
210
+
211
+ expect(sorted).to eq hash
212
+ end
213
+
214
+ it 'adds y start value if only one value given' do
215
+ start_values = { y: -2.0 }
216
+ hash = {
217
+ x: x_sorted.unshift(x_sorted.first),
218
+ y: y_sorted.unshift(start_values[:y])
219
+ }
220
+
221
+ sorted = evaluator.sort_curve_values(x, y, start_values)
222
+
223
+ expect(sorted).to eq hash
224
+ end
225
+
226
+ it 'adds end values if given' do
227
+ end_values = { x: -1.0, y: -2.0 }
228
+ hash = {
229
+ x: x_sorted.push(end_values[:x]),
230
+ y: y_sorted.push(end_values[:y])
231
+ }
232
+
233
+ sorted = evaluator.sort_curve_values(x, y, nil, end_values)
234
+
235
+ expect(sorted).to eq hash
236
+ end
237
+
238
+ it 'adds y end values if only one value is given' do
239
+ end_values = { y: -2.0 }
240
+ hash = {
241
+ x: x_sorted.push(x_sorted.last),
242
+ y: y_sorted.push(end_values[:y])
243
+ }
244
+
245
+ sorted = evaluator.sort_curve_values(x, y, nil, end_values)
246
+
247
+ expect(sorted).to eq hash
248
+ end
249
+
250
+ it 'adds x end values if only one value is given' do
251
+ end_values = { x: -1.0 }
252
+ hash = {
253
+ x: x_sorted.push(end_values[:x]),
254
+ y: y_sorted.push(y_sorted.last)
255
+ }
256
+
257
+ sorted = evaluator.sort_curve_values(x, y, nil, end_values)
258
+
259
+ expect(sorted).to eq hash
260
+ end
261
+ end
262
+
263
+ describe '#area_under_curve' do
264
+ let(:pr_auc) do
265
+ evaluator.area_under_curve(
266
+ curve_data[:precisions],
267
+ curve_data[:precisions]
268
+ )
269
+ end
270
+
271
+ let(:roc_auc) do
272
+ evaluator.area_under_curve(
273
+ curve_data[:fp_rates],
274
+ curve_data[:tp_rates]
275
+ )
276
+ end
277
+
278
+ it 'returns a numeric value for pr_auc' do
279
+ expect(pr_auc).to be_a Numeric
280
+ end
281
+
282
+ it 'returns a numeric value between 0.0 & 1.0 for pr_auc' do
283
+ is_between_zero_and_one = (pr_auc >= 0.0 && pr_auc <= 1.0)
284
+ expect(is_between_zero_and_one).to be true
285
+ end
286
+
287
+ it 'returns a numeric value for roc_auc' do
288
+ expect(roc_auc).to be_a Numeric
289
+ end
290
+
291
+ it 'returns a numeric value between 0.0 & 1.0 for roc_auc' do
292
+ is_between_zero_and_one = roc_auc >= 0.0 && roc_auc <= 1.0
293
+ expect(is_between_zero_and_one).to be true
294
+ end
295
+
296
+ it 'returns the right values' do
297
+ x = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
298
+ y = [1.0, 0.8, 0.6, 0.4, 0.2, 0.0]
299
+ auc = 0.5
300
+
301
+ expect(evaluator.area_under_curve(x, y)).to eq auc
302
+ end
303
+ end
304
+ end
305
+
306
+ describe '#create_testcorpus_classification_file!' do
307
+ let(:ground_truth) do
308
+ { # presumably mirrors spec/resources/corpora/test/ground-truth.txt, plus one extra unused sample
309
+ '0-1': { # this is a sample that is not used!
310
+ old_revision_id: 0,
311
+ new_revision_id: 1,
312
+ class: 'R'
313
+ },
314
+ '307084144-326873205': {
315
+ old_revision_id: 307_084_144,
316
+ new_revision_id: 326_873_205,
317
+ class: 'R'
318
+ },
319
+ '326471754-326978767': {
320
+ old_revision_id: 326_471_754,
321
+ new_revision_id: 326_978_767,
322
+ class: 'V'
323
+ },
324
+ '328774035-328774110': {
325
+ old_revision_id: 328_774_035,
326
+ new_revision_id: 328_774_110,
327
+ class: 'R'
328
+ }
329
+ }
330
+ end
331
+
332
+ it 'raises an argument error if ground_truth param is nil' do
333
+ expect { evaluator.create_testcorpus_classification_file!('blah', nil) }
334
+ .to raise_error ArgumentError
335
+ end
336
+
337
+ it 'creates a classification file in the base output directory' do
338
+ expect(File.exist?(@test_classification_file)).to be false
339
+ evaluator.create_testcorpus_classification_file!(@test_classification_file, ground_truth)
340
+ expect(File.exist?(@test_classification_file)).to be true
341
+ end
342
+
343
+ it 'creates a file with an appropriate header' do
344
+ evaluator.create_testcorpus_classification_file!(@test_classification_file, ground_truth)
345
+ content = File.open(@test_classification_file, 'r')
346
+
347
+ instances = Weka::Core::Instances.from_arff(@test_arff_file)
348
+ features = instances.attribute_names.map(&:upcase)[0...-2]
349
+ proposed_header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *features]
350
+ header = content.lines.first.split(' ')
351
+
352
+ expect(header).to eq proposed_header
353
+ end
354
+
355
+ it 'creates a file with an appropriate number of lines' do
356
+ evaluator.create_testcorpus_classification_file!(@test_classification_file, ground_truth)
357
+ content = File.open(@test_classification_file, 'r')
358
+
359
+ samples_count = Weka::Core::Instances.from_arff(@test_arff_file).size
360
+
361
+ lines = content.lines.to_a
362
+ lines.shift # remove header
363
+ expect(lines.count).to eq samples_count
364
+ end
365
+
366
+ it 'has the short class names as class value' do
367
+ evaluator.create_testcorpus_classification_file!(@test_classification_file, ground_truth)
368
+ content = File.open(@test_classification_file, 'r')
369
+
370
+ lines = content.lines.to_a
371
+ lines.shift # remove header
372
+ short_classes = Instances::CLASSES_SHORT
373
+ vandalism_index = Instances::VANDALISM_CLASS_INDEX
374
+ regular_index = Instances::REGULAR_CLASS_INDEX
375
+ missing_index = Instances::NOT_KNOWN_INDEX
376
+
377
+ names = [
378
+ short_classes[regular_index],
379
+ short_classes[vandalism_index],
380
+ short_classes[missing_index]
381
+ ]
382
+
383
+ lines.each do |line|
384
+ class_name = line.split[2]
385
+ expect(names).to include class_name
386
+ end
387
+ end
388
+ end
389
+
390
+ describe '#evaluate_testcorpus_classification' do
391
+ describe 'exceptions' do
392
+ it 'raises an error unless a ground thruth file is configured' do
393
+ config = test_config
394
+ config.instance_variable_set :@test_corpus_ground_truth_file, nil
395
+ use_configuration(config)
396
+
397
+ classifier = Classifier.new
398
+ evaluator = Evaluator.new(classifier)
399
+
400
+ expect { evaluator.evaluate_testcorpus_classification }.to raise_error \
401
+ Wikipedia::VandalismDetection::GroundTruthFileNotConfiguredError
402
+ end
403
+
404
+ it 'raises an error unless the ground thruth file can be found' do
405
+ config = test_config
406
+ config.instance_variable_set(:@test_corpus_ground_truth_file, 'false-file-name.txt')
407
+ use_configuration(config)
408
+
409
+ classifier = Classifier.new
410
+ evaluator = Evaluator.new(classifier)
411
+
412
+ expect { evaluator.evaluate_testcorpus_classification }.to raise_error \
413
+ Wikipedia::VandalismDetection::GroundTruthFileNotFoundError
414
+ end
415
+ end
416
+
417
+ it 'returns a performance values Hash' do
418
+ values = evaluator.evaluate_testcorpus_classification(sample_count: sample_count)
419
+ expect(values).to be_a Hash
420
+ end
421
+
422
+ %i[
423
+ fp_rates
424
+ tp_rates
425
+ precisions
426
+ recalls
427
+ pr_auc
428
+ roc_auc
429
+ total_precision
430
+ total_recall
431
+ ].each do |attribute|
432
+ it "returns a performance values Hash with property'#{attribute}'" do
433
+ values = evaluator.evaluate_testcorpus_classification(sample_count: sample_count)
434
+ expect(values[attribute]).to_not be_nil
435
+ end
436
+ end
437
+
438
+ it 'runs the classification file creation' do
439
+ expect(File.exist?(@test_classification_file)).to be false
440
+ evaluator.evaluate_testcorpus_classification
441
+ expect(File.exist?(@test_classification_file)).to be true
442
+ end
443
+
444
+ it 'overwrites the old classification file' do
445
+ config = test_config
446
+
447
+ config.instance_variable_set(:@features, ['comment length'])
448
+ use_configuration(config)
449
+
450
+ classifier = Classifier.new
451
+ evaluator = Evaluator.new(classifier)
452
+
453
+ evaluator.evaluate_testcorpus_classification
454
+ content_old = File.read(@test_classification_file)
455
+
456
+ config.instance_variable_set(:@features, ['anonymity'])
457
+ use_configuration(config)
458
+
459
+ classifier = Classifier.new
460
+ evaluator = Evaluator.new(classifier)
461
+
462
+ evaluator.evaluate_testcorpus_classification
463
+ content_new = File.read(@test_classification_file)
464
+
465
+ expect(content_old).to_not eq content_new
466
+ end
467
+ end
468
+
469
# #cross_validate is expected to return a single Weka Evaluation object by
# default and an Array of Evaluation objects when called with
# `equally_distributed: true`.
describe '#cross_validate' do
  it 'returns an evaluation object' do
    result = evaluator.cross_validate
    expect(result).to be_a Java::WekaClassifiers::Evaluation
  end

  it 'cross validates the classifier with equally distributed samples' do
    result = evaluator.cross_validate(equally_distributed: true)
    expect(result).to be_an Array

    result.each do |item|
      expect(item).to be_a Java::WekaClassifiers::Evaluation
    end
  end
end
484
+
485
# #curve_data is checked in two sampling modes. Both modes must return the
# same structure, so the expectations are written once as shared examples
# and included into each context (example names and order are unchanged).
describe '#curve_data' do
  # Expects `data` to be defined via `let` by the including context.
  shared_examples 'precision/recall curve data' do
    it 'returns a Hash' do
      expect(data).to be_a Hash
    end

    it 'includes precision curve data' do
      expect(data[:precision]).to be_an Array
    end

    it 'includes recall curve data' do
      expect(data[:recall]).to be_an Array
    end

    it 'includes area_under_prc data' do
      expect(data[:area_under_prc]).to be_a Numeric
    end

    it 'has non-empty :precision Array contents' do
      expect(data[:precision]).to_not be_empty
    end

    it 'has non-empty :recall Array contents' do
      expect(data[:recall]).to_not be_empty
    end
  end

  describe 'all samples' do
    let(:data) { evaluator.curve_data }

    include_examples 'precision/recall curve data'
  end

  describe 'equally distributed samples' do
    let(:data) { evaluator.curve_data(equally_distributed: true) }

    include_examples 'precision/recall curve data'
  end
end
542
+
543
# #feature_analysis is expected to return a Hash with one entry per
# configured feature. Each entry is itself a Hash with `sample_count`
# entries, whose values carry the :fp, :fn, :tp and :tn keys.
describe '#feature_analysis' do
  it 'returns a hash' do
    expect(evaluator.feature_analysis(sample_count: 100)).to be_a(Hash)
  end

  it 'returns a hash with feature count size' do
    result = evaluator.feature_analysis(sample_count: 100)
    expect(result.count).to eq(@config.features.count)
  end

  it 'returns a hash with sample count number of data hashes' do
    expected_size = 5
    per_feature = evaluator.feature_analysis(sample_count: expected_size)

    per_feature.values.each do |thresholds|
      expect(thresholds.count).to eq(expected_size)
    end
  end

  it 'returns the four predictive values in each features threshold hash' do
    result = evaluator.feature_analysis
    # Looks up the entry stored under key 0.0 for the first configured feature.
    entry = result[@config.features.first][0.0]

    %i[fp fn tp tn].each do |key|
      expect(entry).to have_key(key)
    end
  end
end
573
+
574
# #full_analysis is expected to return a Hash with `sample_count` entries,
# whose values carry the :fp, :fn, :tp and :tn keys.
describe '#full_analysis' do
  it 'returns a hash' do
    analysis = evaluator.full_analysis(sample_count: 100)
    expect(analysis).to be_a Hash
  end

  it 'returns a hash with sample count number of threshold hashes' do
    sample_count = 5
    analysis = evaluator.full_analysis(sample_count: sample_count)
    expect(analysis.count).to eq sample_count
  end

  it 'returns the four predictive values in each features threshold hash' do
    analysis = evaluator.full_analysis
    # Looks up the entry stored under key 0.0.
    threshold_hash = analysis[0.0]

    expect(threshold_hash).to have_key(:fp)
    expect(threshold_hash).to have_key(:fn)
    expect(threshold_hash).to have_key(:tp)
    expect(threshold_hash).to have_key(:tn)
  end
end
596
+
597
# Evaluator.true_positive? is called with (ground truth, confidence,
# threshold). These examples expect true only for the `vandalism` ground
# truth with a confidence strictly above the threshold.
# NOTE(review): `vandalism` and `regular` are defined outside this group,
# presumably as the "V"/"R" ground-truth labels; confirm.
describe '#true_positive?' do
  let(:threshold) { 0.7 }

  # Calls the predicate under test with the group's threshold.
  def true_positive_for(ground_truth, confidence)
    Evaluator.true_positive?(ground_truth, confidence, threshold)
  end

  it 'returns true if confidence > threshold regarding ground truth "V"' do
    expect(true_positive_for(vandalism, threshold + 0.1)).to be true
  end

  it 'returns false if confidence < threshold regarding ground truth "V"' do
    expect(true_positive_for(vandalism, threshold - 0.2)).to be false
  end

  it 'returns false for same confidence & threshold if ground truth is "V"' do
    expect(true_positive_for(vandalism, threshold)).to be false
  end

  it 'returns false if confidence > threshold regarding ground truth "R"' do
    expect(true_positive_for(regular, threshold + 0.1)).to be false
  end

  it 'returns false if confidence < threshold regarding ground truth "R"' do
    expect(true_positive_for(regular, threshold - 0.1)).to be false
  end
end
625
+
626
# Evaluator.true_negative? is called with (ground truth, confidence,
# threshold). These examples expect true only for the `regular` ground
# truth with a confidence strictly below the threshold.
# NOTE(review): `vandalism` and `regular` are defined outside this group,
# presumably as the "V"/"R" ground-truth labels; confirm.
describe '#true_negative?' do
  let(:threshold) { 0.7 }

  # Calls the predicate under test with the group's threshold.
  def true_negative_for(ground_truth, confidence)
    Evaluator.true_negative?(ground_truth, confidence, threshold)
  end

  it 'returns true if confidence < threshold regarding ground truth "R"' do
    expect(true_negative_for(regular, threshold - 0.1)).to be true
  end

  it 'returns false if confidence > threshold regarding ground truth "R"' do
    expect(true_negative_for(regular, threshold + 0.1)).to be false
  end

  it 'returns false for same confidence & threshold if ground truth is "R"' do
    expect(true_negative_for(regular, threshold)).to be false
  end

  it 'returns false if confidence < threshold regarding ground truth "V"' do
    expect(true_negative_for(vandalism, threshold - 0.1)).to be false
  end

  it "returns false if confidence > threshold regarding ground truth 'V'" do
    expect(true_negative_for(vandalism, 0.8)).to be false
  end
end
653
+
654
# Evaluator.false_positive? is called with (ground truth, confidence,
# threshold). These examples expect true for the `regular` ground truth
# with a confidence at or above the threshold, and false otherwise.
# NOTE(review): `vandalism` and `regular` are defined outside this group,
# presumably as the "V"/"R" ground-truth labels; confirm.
describe '#false_positive?' do
  let(:threshold) { 0.7 }

  # Calls the predicate under test with the group's threshold.
  def false_positive_for(ground_truth, confidence)
    Evaluator.false_positive?(ground_truth, confidence, threshold)
  end

  it 'returns true if confidence > threshold regarding ground truth "R"' do
    expect(false_positive_for(regular, threshold + 0.1)).to be true
  end

  it 'returns false if confidence < threshold regarding ground truth "R"' do
    expect(false_positive_for(regular, threshold - 0.1)).to be false
  end

  it 'returns true for same confidence & threshold if ground truth is "R"' do
    expect(false_positive_for(regular, threshold)).to be true
  end

  it 'returns false if confidence > threshold regarding ground truth "V"' do
    expect(false_positive_for(vandalism, threshold + 0.1)).to be false
  end

  it 'returns false if confidence < threshold regarding ground truth "V"' do
    expect(false_positive_for(vandalism, threshold - 0.1)).to be false
  end
end
682
+
683
# Evaluator.false_negative? is called with (ground truth, confidence,
# threshold). These examples expect true for the `vandalism` ground truth
# with a confidence at or below the threshold, and false otherwise.
# NOTE(review): `vandalism` and `regular` are defined outside this group,
# presumably as the "V"/"R" ground-truth labels; confirm.
describe '#false_negative?' do
  let(:threshold) { 0.7 }

  # Calls the predicate under test with the group's threshold.
  def false_negative_for(ground_truth, confidence)
    Evaluator.false_negative?(ground_truth, confidence, threshold)
  end

  it 'returns true if confidence < threshold regarding ground truth "V"' do
    expect(false_negative_for(vandalism, threshold - 0.1)).to be true
  end

  it 'returns false if confidence > threshold regarding ground truth "V"' do
    expect(false_negative_for(vandalism, threshold + 0.1)).to be false
  end

  it 'returns true for same confidence & threshold if ground truth is "V"' do
    expect(false_negative_for(vandalism, threshold)).to be true
  end

  it 'returns false if confidence < threshold regarding ground truth "R"' do
    expect(false_negative_for(regular, threshold - 0.1)).to be false
  end

  it 'returns false if confidence > threshold regarding ground truth "R"' do
    expect(false_negative_for(regular, threshold + 0.1)).to be false
  end
end
711
+ end