wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,330 @@
require 'spec_helper'

# Specs for Wikipedia::VandalismDetection::Classifier.
#
# Covers classifier construction from the configuration (including the error
# cases for missing or unknown settings), the different training data options
# (balanced, unbalanced, oversampled), #classify and #cross_validate.
describe Wikipedia::VandalismDetection::Classifier do
  # Activates the test configuration with some settings overridden.
  #
  # @param overrides [Hash{Symbol => Object}] setting name (without the
  #   leading "@") mapped to the value stored in the configuration's instance
  #   variable of that name
  def use_config_with(overrides)
    config = test_config

    overrides.each do |setting, value|
      config.instance_variable_set("@#{setting}", value)
    end

    use_configuration(config)
  end

  # Counts the instances of the dataset that carry the given class index.
  #
  # @param dataset [Instances] dataset whose instances are inspected
  # @param class_index [Integer] class index to look for
  # @return [Integer] number of matching instances
  def class_count(dataset, class_index)
    dataset.instances.count do |instance|
      instance.class_value.to_i == class_index
    end
  end

  before do
    use_test_configuration
    @config = test_config
  end

  after do
    arff_file = @config.training_output_arff_file
    build_dir = @config.output_base_directory

    # Removing the containing directory also removes the generated ARFF file.
    FileUtils.rm_r(File.dirname(arff_file)) if File.exist?(arff_file)
    FileUtils.rm_r(build_dir) if Dir.exist?(build_dir)
  end

  it 'loads the configured classifier while instanciating' do
    class_type = "Weka::Classifiers::#{@config.classifier_type}".constantize

    expect(subject.classifier_instance).to be_a class_type
  end

  it 'loads the configured classifier with given dataset' do
    class_type = "Weka::Classifiers::#{@config.classifier_type}".constantize
    dataset = Instances.empty_for_feature('anonymity')
    dataset.add_instance([1.0, Instances::REGULAR])

    classifier = Classifier.new(dataset)

    expect(classifier.classifier_instance).to be_a class_type
    expect(classifier.dataset).to be dataset
  end

  it 'raises an error if no classifier is configured' do
    use_config_with(classifier_type: nil)

    expect { Classifier.new }.to raise_error(
      Wikipedia::VandalismDetection::ClassifierNotConfiguredError
    )
  end

  it 'raises an error if an unknown classifier is configured' do
    use_config_with(classifier_type: 'Unknown Classifier')

    expect { Classifier.new }.to raise_error(
      Wikipedia::VandalismDetection::ClassifierUnknownError
    )
  end

  it 'raises an error if no features are configured' do
    use_config_with(features: [])

    expect { Classifier.new }.to raise_error(
      Wikipedia::VandalismDetection::FeaturesNotConfiguredError
    )
  end

  it 'loads & trains the classifier with balanced dataset if configured' do
    use_config_with(training_data_options: 'balanced')

    classifier = Classifier.new

    # 2 vandalism, 2 regular, see resources/corpora/training/annotations.csv
    expect(classifier.dataset.size).to eq 4
  end

  it 'loads & trains the classifier with unbalanced dataset if configured' do
    use_config_with(training_data_options: 'unbalanced')

    dataset = Classifier.new.dataset

    # 2 vandalism, 4 regular, see resources/corpora/training/annotations.csv
    expect(dataset.size).to eq 6
    expect(class_count(dataset, Instances::REGULAR_CLASS_INDEX)).to eq 4
    expect(class_count(dataset, Instances::VANDALISM_CLASS_INDEX)).to eq 2
  end

  it 'loads & trains the classifier with oversampled dataset if configured' do
    use_config_with(training_data_options: 'oversampled')

    dataset = Classifier.new.dataset

    # 4 vandalism, 4 regular, due to SMOTE oversampling
    expect(dataset.size).to eq 8
    expect(class_count(dataset, Instances::REGULAR_CLASS_INDEX)).to eq 4
    expect(class_count(dataset, Instances::VANDALISM_CLASS_INDEX)).to eq 4
  end

  it 'loads & trains the classifier with customized oversampled dataset if configured' do
    use_config_with(training_data_options: 'oversampled -p 200 -u false')

    dataset = Classifier.new.dataset

    # 2 + 200 % = 6 vandalism, 4 regular, due to SMOTE oversampling without
    # undersampling
    expect(dataset.size).to eq 10
    expect(class_count(dataset, Instances::REGULAR_CLASS_INDEX)).to eq 4
    expect(class_count(dataset, Instances::VANDALISM_CLASS_INDEX)).to eq 6
  end

  describe 'attribute readers' do
    %i[classifier_instance evaluator dataset].each do |attribute|
      # Interpolate the attribute itself so that every generated example gets
      # its own, accurate description.
      it "has a readable #{attribute} attribute" do
        expect(subject).to respond_to attribute
      end
    end

    it 'returns an Evaluator instance from attribute #evaluator' do
      expect(subject.evaluator).to be_an Evaluator
    end
  end

  describe '#classify' do
    let(:edit) { build(:edit) }

    let(:features) do
      calculator = Wikipedia::VandalismDetection::FeatureCalculator.new
      calculator.calculate_features_for(edit)
    end

    # Builds a dataset that contains every training row twice: once labelled
    # with the vandalism class and once with the regular class.
    #
    # More instances are added because the number of instances must be higher
    # than the number of cross validation folds.
    #
    # @return [Instances] the enlarged dataset
    def doubled_training_dataset
      rows = TrainingDataset.instances.to_m.to_a
      class_indices = [
        Instances::VANDALISM_CLASS_INDEX,
        Instances::REGULAR_CLASS_INDEX
      ]

      class_indices.each_with_object(Instances.empty) do |index, dataset|
        class_value = Instances::CLASSES[index]

        # The last column of a row is its original class value; replace it.
        rows.each { |row| dataset.add_instance([*row[0..-2], class_value]) }
      end
    end

    it 'raises an error if the argument is no Edit or feature Array' do
      expect { subject.classify('data') }.to raise_error ArgumentError
    end

    # A bare `not_to raise_error` is used in the next two examples: the
    # class-specific negative form would pass on any other error and is
    # flagged by RSpec as a potential false positive.
    it 'takes an Edit as argument' do
      expect { subject.classify(edit) }.not_to raise_error
    end

    it 'takes a feature Array as argument' do
      expect { subject.classify(features) }.not_to raise_error
    end

    it 'returns the same value for both edit and features as argument' do
      confidence_from_edit = subject.classify(edit)
      confidence_from_features = subject.classify(features)

      expect(confidence_from_edit).to eq confidence_from_features
    end

    it 'returns a Numeric value as the confidence of vandalism class' do
      expect(subject.classify(features)).to be_a Numeric
    end

    it 'returns a confidence between 0.0 and 1.0' do
      confidence = subject.classify(features)

      expect(confidence).to be_between(0.0, 1.0).inclusive
    end

    it 'returns -1.0 if features cannot be computed from the edit' do
      allow_any_instance_of(Wikipedia::VandalismDetection::FeatureCalculator)
        .to receive(:calculate_features_for)
        .and_return([])

      expect(subject.classify(edit)).to eq(-1.0)
    end

    describe 'with option ":return_all_params = true"' do
      it 'returns a hash' do
        parameters = subject.classify(features, return_all_params: true)

        expect(parameters).to be_a Hash
      end

      %i[confidence class_index].each do |key|
        it "returns a hash with key :#{key}" do
          results = subject.classify(features, return_all_params: true)

          expect(results.keys).to include key
        end
      end

      it 'returns a class_index value of 0 or 1' do
        results = subject.classify(features, return_all_params: true)

        expect([0, 1]).to include results[:class_index]
      end

      it 'returns an confidence value that is between 0.0 and 1.0' do
        results = subject.classify(features, return_all_params: true)

        expect(results[:confidence]).to be_between(0.0, 1.0).inclusive
      end
    end

    it 'raises an argument error if given features are an empty array' do
      expect { subject.classify([]) }.to raise_error ArgumentError
    end

    it 'it handles NaN return values (i.e. is not implemented)' do
      use_config_with(
        classifier_type: 'Meta::OneClassClassifier',
        classifier_options: "-tcl #{Instances::VANDALISM}"
      )

      classifier = Classifier.new(doubled_training_dataset)
      results = classifier.classify(features, return_all_params: true)

      expect(results).to be_a Hash
    end

    it 'handles one class classification with "regular" as target class' do
      use_config_with(
        classifier_type: 'Meta::OneClassClassifier',
        classifier_options: "-tcl #{Instances::REGULAR}"
      )

      classifier = Classifier.new(doubled_training_dataset)
      results = classifier.classify(features, return_all_params: true)

      expect(results).to be_a Hash
    end
  end

  describe '#cross_validate' do
    it 'returns an Evaluation object' do
      evaluation = subject.cross_validate

      expect(evaluation).to be_a Java::WekaClassifiers::Evaluation
    end

    context 'with option "equally distributed"' do
      it 'returns an array of Evaluation objects' do
        evaluations = subject.cross_validate(equally_distributed: true)

        evaluations.each do |evaluation|
          expect(evaluation).to be_a Java::WekaClassifiers::Evaluation
        end
      end
    end
  end
end