wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,601 @@
1
+ require 'spec_helper'
2
+ require 'yaml'
3
+
4
+ describe Wikipedia::VandalismDetection do
5
+ DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS
6
+
7
+ describe 'Configuration class' do
8
+ before do
9
+ allow_any_instance_of(Wikipedia::VandalismDetection::DefaultConfiguration)
10
+ .to receive(:source)
11
+ .and_return(source_dir)
12
+
13
+ @config = Wikipedia::VandalismDetection::Configuration.instance
14
+
15
+ use_test_configuration
16
+ end
17
+
18
+ %i[
19
+ data
20
+ features
21
+ classifier_type
22
+ classifier_options
23
+ cross_validation_fold
24
+ training_data_options
25
+ balanced_training_data?
26
+ unbalanced_training_data?
27
+ oversampled_training_data?
28
+ test_output_classification_file
29
+ oversampling_options
30
+ training_output_arff_file
31
+ test_output_arff_file
32
+ replace_training_data_missing_values?
33
+ ].each do |attribute|
34
+ it "responds to ##{attribute}" do
35
+ expect(@config).to respond_to attribute
36
+ end
37
+ end
38
+
39
+ it 'returns a hash for #data (the full config hash)' do
40
+ expect(@config.data).to be_a Hash
41
+ end
42
+
43
+ it 'returns a feature array for #feature' do
44
+ expect(@config.features).to be_an Array
45
+ end
46
+
47
+ it 'returns a numeric for #cross-validation-fold' do
48
+ expect(@config.cross_validation_fold).to be_a Numeric
49
+ end
50
+
51
+ describe '#test_output_classification_file' do
52
+ it 'returns the classification file path extended by classifier name and training data options' do
53
+ file_path = @config.test_output_classification_file
54
+ classifier_name = @config.classifier_type.split('::').last.downcase
55
+ dataset_options = @config.training_data_options
56
+ file_name = DEFAULTS['output']['test']['classification_file']
57
+
58
+ path = File.join(
59
+ @config.output_base_directory,
60
+ classifier_name,
61
+ dataset_options,
62
+ file_name
63
+ )
64
+
65
+ expect(file_path).to eq path
66
+ end
67
+ end
68
+
69
+ describe 'output arff files' do
70
+ describe '#training_output_arff_file' do
71
+ it 'returns the arff file path extended by classifier name and training data options' do
72
+ file_path = @config.training_output_arff_file
73
+ classifier_name = @config.classifier_type.split('::').last.downcase
74
+ dataset_options = @config.training_data_options
75
+ file_name = DEFAULTS['output']['training']['arff_file']
76
+
77
+ path = File.join(
78
+ @config.output_base_directory,
79
+ classifier_name,
80
+ dataset_options,
81
+ file_name
82
+ )
83
+
84
+ expect(file_path).to eq path
85
+ end
86
+ end
87
+
88
+ describe '#test_output_arff_file' do
89
+ it 'returns the arff file path extended by classifier name and training data options' do
90
+ file_path = @config.test_output_arff_file
91
+ classifier_name = @config.classifier_type.split('::').last.downcase
92
+ dataset_options = @config.training_data_options
93
+ file_name = DEFAULTS['output']['test']['arff_file']
94
+
95
+ path = File.join(
96
+ @config.output_base_directory,
97
+ classifier_name,
98
+ dataset_options,
99
+ file_name
100
+ )
101
+
102
+ expect(file_path).to eq path
103
+ end
104
+ end
105
+ end
106
+
107
+ describe '#use_occ?' do
108
+ it 'returns true if used classifier is a one class classifier' do
109
+ config = test_config
110
+ classifier_type = Weka::Classifiers::Meta::OneClassClassifier.type
111
+ config.instance_variable_set(:@classifier_type, classifier_type)
112
+ use_configuration(config)
113
+ use_occ = Wikipedia::VandalismDetection.config.use_occ?
114
+
115
+ expect(use_occ).to be true
116
+ end
117
+
118
+ it 'returns false if used classifier isn’t a one class classifier' do
119
+ use_test_configuration
120
+ use_occ = Wikipedia::VandalismDetection.config.use_occ?
121
+
122
+ expect(use_occ).to be false
123
+ end
124
+ end
125
+
126
+ describe '#balanced_training_data?' do
127
+ context 'if it is set in config' do
128
+ it 'returns true' do
129
+ config = test_config
130
+ config.instance_variable_set(:@training_data_options, 'balanced')
131
+ use_configuration(config)
132
+ balanced = Wikipedia::VandalismDetection.config.balanced_training_data?
133
+
134
+ expect(balanced).to be true
135
+ end
136
+ end
137
+
138
+ context 'if it is not set in config' do
139
+ it 'returns false ' do
140
+ config = test_config
141
+ config.instance_variable_set(:@training_data_options, nil)
142
+ use_configuration(config)
143
+ balanced = Wikipedia::VandalismDetection.config.balanced_training_data?
144
+
145
+ expect(balanced).to be false
146
+ end
147
+ end
148
+
149
+ context 'if it is set to other value than "balanced"' do
150
+ it 'returns false' do
151
+ config = test_config
152
+ config.instance_variable_set(:@training_data_options, 'other')
153
+ use_configuration(config)
154
+ balanced = Wikipedia::VandalismDetection.config.balanced_training_data?
155
+
156
+ expect(balanced).to be false
157
+ end
158
+ end
159
+ end
160
+
161
+ describe '#unbalanced_training_data?' do
162
+ context 'if it is set in config' do
163
+ it 'returns true' do
164
+ config = test_config
165
+ config.instance_variable_set(:@training_data_options, 'unbalanced')
166
+ use_configuration(config)
167
+ unbalanced = Wikipedia::VandalismDetection.config.unbalanced_training_data?
168
+
169
+ expect(unbalanced).to be true
170
+ end
171
+ end
172
+
173
+ context 'if it is not set in config' do
174
+ it 'returns true' do
175
+ config = test_config
176
+ config.instance_variable_set(:@training_data_options, nil)
177
+ use_configuration(config)
178
+ unbalanced = Wikipedia::VandalismDetection.config.unbalanced_training_data?
179
+
180
+ expect(unbalanced).to be true
181
+ end
182
+ end
183
+
184
+ context 'if set to other value than "unbalanced" or "oversampled"' do
185
+ it 'returns true' do
186
+ config = test_config
187
+ config.instance_variable_set(:@training_data_options, 'other value')
188
+ use_configuration(config)
189
+ unbalanced = Wikipedia::VandalismDetection.config.unbalanced_training_data?
190
+
191
+ expect(unbalanced).to be true
192
+ end
193
+ end
194
+
195
+ context 'if it is set to other value than "unbalanced"' do
196
+ it 'returns false' do
197
+ config = test_config
198
+ config.instance_variable_set(:@training_data_options, 'balanced')
199
+ use_configuration(config)
200
+ unbalanced = Wikipedia::VandalismDetection.config.unbalanced_training_data?
201
+
202
+ expect(unbalanced).to be false
203
+ end
204
+ end
205
+ end
206
+
207
+ describe '#oversampled_training_data?' do
208
+ context 'if it is set in config' do
209
+ it 'returns true' do
210
+ config = test_config
211
+ config.instance_variable_set(:@training_data_options, 'oversampled')
212
+ use_configuration(config)
213
+ oversampled = Wikipedia::VandalismDetection.config.oversampled_training_data?
214
+
215
+ expect(oversampled).to be true
216
+ end
217
+ end
218
+
219
+ context 'if it is not set in config' do
220
+ it 'returns false' do
221
+ config = test_config
222
+ config.instance_variable_set(:@training_data_options, nil)
223
+ use_configuration(config)
224
+ oversampled = Wikipedia::VandalismDetection.config.oversampled_training_data?
225
+
226
+ expect(oversampled).to be false
227
+ end
228
+ end
229
+
230
+ context 'if it is set to other value than "balanced"' do
231
+ it 'returns false' do
232
+ config = test_config
233
+ config.instance_variable_set(:@training_data_options, 'other')
234
+ use_configuration(config)
235
+ oversampled = Wikipedia::VandalismDetection.config.oversampled_training_data?
236
+
237
+ expect(oversampled).to be false
238
+ end
239
+ end
240
+ end
241
+
242
+ describe '#oversampled_options' do
243
+ it 'returns a hash' do
244
+ expect(@config.oversampling_options).to be_a Hash
245
+ end
246
+
247
+ it 'returns a hash with the :percent and :undersampling keys' do
248
+ config = test_config
249
+ config.instance_variable_set(:@training_data_options, 'oversampled')
250
+ use_configuration(config)
251
+
252
+ options = Wikipedia::VandalismDetection.config.oversampling_options
253
+ hash = { percentage: 0, undersampling: true }
254
+
255
+ expect(options.keys).to eq hash.keys
256
+ end
257
+
258
+ context 'if training data is not oversampled' do
259
+ it 'returns an empty hash' do
260
+ config = test_config
261
+ config.instance_variable_set(:@training_data_options, 'other')
262
+ use_configuration(config)
263
+ options = Wikipedia::VandalismDetection.config.oversampling_options
264
+
265
+ expect(options).to eq({})
266
+ end
267
+ end
268
+
269
+ describe 'Returning of configured options' do
270
+ let(:percentage) { 300.0 }
271
+ let(:undersampling) { 200.0 }
272
+
273
+ let(:sampling_options) do
274
+ {
275
+ percentage: percentage,
276
+ undersampling: undersampling
277
+ }
278
+ end
279
+
280
+ it 'returns the configured options with downcase params' do
281
+ config = test_config
282
+ options = "oversampled -p #{percentage} -u true #{undersampling}"
283
+ config.instance_variable_set(:@training_data_options, options)
284
+ use_configuration(config)
285
+ hash = Wikipedia::VandalismDetection.config.oversampling_options
286
+
287
+ expect(hash).to eq sampling_options
288
+ end
289
+
290
+ it 'returns the configured options with upcase params' do
291
+ config = test_config
292
+ options = "oversampled -P #{percentage} -U true #{undersampling}"
293
+ config.instance_variable_set(:@training_data_options, options)
294
+ use_configuration(config)
295
+ hash = Wikipedia::VandalismDetection.config.oversampling_options
296
+
297
+ expect(hash).to eq sampling_options
298
+ end
299
+
300
+ it 'returns the configured options with full params' do
301
+ config = test_config
302
+ options = "oversampled -Percentage #{percentage} -Undersampling true #{undersampling}"
303
+ config.instance_variable_set(:@training_data_options, options)
304
+ use_configuration(config)
305
+ hash = Wikipedia::VandalismDetection.config.oversampling_options
306
+
307
+ expect(hash).to eq sampling_options
308
+ end
309
+ end
310
+
311
+ it 'returns a default value for percent if not set' do
312
+ percentage = 100 # default value
313
+ undersampling = 200
314
+ sampling_options = {
315
+ percentage: percentage,
316
+ undersampling: undersampling
317
+ }
318
+
319
+ config = test_config
320
+ options = "oversampled -u true #{undersampling}"
321
+ config.instance_variable_set(:@training_data_options, options)
322
+ use_configuration(config)
323
+ hash = Wikipedia::VandalismDetection.config.oversampling_options
324
+
325
+ expect(hash).to eq sampling_options
326
+ end
327
+
328
+ it 'returns a default true for undersampling if not set' do
329
+ percentage = 200
330
+ undersampling = 100 # default value
331
+ sampling_options = {
332
+ percentage: percentage,
333
+ undersampling: undersampling
334
+ }
335
+
336
+ config = test_config
337
+ options = "oversampled -p #{percentage}"
338
+ config.instance_variable_set(:@training_data_options, options)
339
+ use_configuration(config)
340
+ hash = Wikipedia::VandalismDetection.config.oversampling_options
341
+
342
+ expect(hash).to eq sampling_options
343
+ end
344
+
345
+ it 'returns a percentange value for undersampling if set in -u option' do
346
+ percentage = 200
347
+ undersampling = 0.001
348
+ sampling_options = {
349
+ percentage: percentage,
350
+ undersampling: undersampling
351
+ }
352
+
353
+ config = test_config
354
+ options = "oversampled -p #{percentage} -u true #{undersampling}"
355
+ config.instance_variable_set(:@training_data_options, options)
356
+ use_configuration(config)
357
+ hash = Wikipedia::VandalismDetection.config.oversampling_options
358
+
359
+ expect(hash).to eq sampling_options
360
+ end
361
+ end
362
+
363
+ describe '#replace_missing_values?' do
364
+ ['no', 'No', 'false', 'nope', '', nil].each do |option|
365
+ it 'returns false if not set' do
366
+ config = test_config
367
+ config.instance_variable_set(:@replace_missing_values, option)
368
+ use_configuration(config)
369
+ replace = Wikipedia::VandalismDetection.config.replace_training_data_missing_values?
370
+
371
+ expect(replace).to be false
372
+ end
373
+ end
374
+
375
+ %w[yes t T YES True true].each do |option|
376
+ it 'returns true if set' do
377
+ config = test_config
378
+ config.instance_variable_set(:@replace_missing_values, option)
379
+ use_configuration(config)
380
+ replace = Wikipedia::VandalismDetection.config.replace_training_data_missing_values?
381
+
382
+ expect(replace).to be true
383
+ end
384
+ end
385
+ end
386
+
387
+ %i[
388
+ training_corpus_edits_file
389
+ training_corpus_annotations_file
390
+ training_corpus_revisions_directory
391
+ test_corpus_edits_file
392
+ test_corpus_revisions_directory
393
+ test_corpus_ground_truth_file
394
+ training_output_arff_file
395
+ training_output_index_file
396
+ test_output_arff_file
397
+ test_output_index_file
398
+ classifier_type
399
+ classifier_options
400
+ output_base_directory
401
+ ].each do |attribute|
402
+ it "returns a string when calling ##{attribute}" do
403
+ expect(@config.send(attribute)).to be_a String
404
+ end
405
+ end
406
+ end
407
+
408
# Specs for the object returned by Wikipedia::VandalismDetection.config:
# merging of a custom YAML file into the defaults, and the default values
# of the 'features', 'corpora', 'output' and 'classifier' sections.
describe '#configuration' do
  it 'can be overridden by a wikipedia-vandalism-detection.yml file' do
    # Stub #source on every DefaultConfiguration instance so that it
    # returns source_dir.
    allow_any_instance_of(Wikipedia::VandalismDetection::DefaultConfiguration)
      .to receive(:source)
      .and_return(source_dir)

    # The YAML path is resolved relative to this spec file.
    config_file = '../../resources/config/wikipedia-vandalism-detection.yml'
    custom_config = YAML.load_file(File.expand_path(config_file, __FILE__))
    expected_data = DEFAULTS.deep_merge(custom_config)

    expect(Wikipedia::VandalismDetection.config.data).to eq expected_data
  end

  it 'returns a Wikipedia::VandalismDetection::Configuration' do
    expect(Wikipedia::VandalismDetection.config)
      .to be_a Wikipedia::VandalismDetection::Configuration
  end

  it 'has all features as default configuration' do
    use_default_configuration

    # The comparison below is order-sensitive (Array equality).
    features = [
      'anonymity',
      'anonymity previous',
      'all wordlists frequency',
      'all wordlists impact',
      'article size',
      'bad frequency',
      'bad impact',
      'biased frequency',
      'biased impact',
      'blanking',
      'character sequence',
      'character diversity',
      'comment length',
      'comment biased frequency',
      'comment pronoun frequency',
      'comment vulgarism frequency',
      'compressibility',
      'copyedit',
      'digit ratio',
      'edits per user',
      'emoticons frequency',
      'emoticons impact',
      'inserted size',
      'inserted words',
      'inserted character distribution',
      'inserted external links',
      'inserted internal links',
      'longest word',
      'markup frequency',
      'markup impact',
      'non-alphanumeric ratio',
      'personal life',
      'pronoun frequency',
      'pronoun impact',
      'removed size',
      'removed words',
      'removed all wordlists frequency',
      'removed bad frequency',
      'removed biased frequency',
      'removed character distribution',
      'removed emoticons frequency',
      'removed markup frequency',
      'removed pronoun frequency',
      'removed sex frequency',
      'removed vulgarism frequency',
      'replacement similarity',
      'reverted',
      'revisions character distribution',
      'sex frequency',
      'sex impact',
      'same editor',
      'size increment',
      'size ratio',
      'term frequency',
      'time interval',
      'time of day',
      'upper case ratio',
      'upper case words ratio',
      'upper to lower case ratio',
      'vulgarism frequency',
      'vulgarism impact',
      'weekday',
      'words increment'
    ]

    configured_features = Wikipedia::VandalismDetection.config['features']

    expect(configured_features).to eq features
  end

  describe '#configuration#corpora' do
    before do
      use_default_configuration
      @corpora = Wikipedia::VandalismDetection.config['corpora']
    end

    it 'has a corpora config' do
      expect(@corpora).to be_a Hash
    end

    %w[training test].each do |corpus|
      it "has a #{corpus}-corpus config" do
        expect(@corpora[corpus]).to be_a Hash
      end
    end

    it 'has a default nil corpora-base_directory config' do
      expect(@corpora['base_directory']).to be_nil
    end

    %w[
      base_directory
      revisions_directory
      edits_file
      annotations_file
    ].each do |key|
      it "has a default nil '#{key}' config for the training-corpus" do
        expect(@corpora['training'][key]).to be_nil
      end
    end

    %w[
      base_directory
      revisions_directory
      edits_file
    ].each do |key|
      it "has a default nil '#{key}' config for the test-corpus" do
        expect(@corpora['test'][key]).to be_nil
      end
    end
  end

  describe '#configuration#output' do
    before do
      use_default_configuration
      @output_config = Wikipedia::VandalismDetection.config['output']
    end

    it 'has an output-config' do
      expect(@output_config).to be_a Hash
    end

    describe 'output sub configs' do
      it 'has a default "base_directory" output-config' do
        expect(@output_config['base_directory']).to_not be_nil
      end

      # One example per output stage and file key. Each stage reads its own
      # section, so the test-output examples really inspect 'test' rather
      # than re-checking 'training'.
      %w[training test].each do |stage|
        %w[arff_file index_file].each do |key|
          it "has a default '#{key}' config for the #{stage}-output" do
            stage_config = @output_config[stage]

            expect(stage_config).to_not be_nil
            expect(stage_config[key]).to_not be_nil
          end
        end
      end
    end
  end

  describe '#configuration#classifier' do
    before do
      use_default_configuration
      @classifier = Wikipedia::VandalismDetection.config['classifier']
    end

    it 'returns a classifier Hash' do
      expect(@classifier).to be_a Hash
    end

    %w[type options].each do |key|
      it "has a default nil '#{key}' config for classification" do
        expect(@classifier[key]).to be_nil
      end
    end

    it 'has a default 10-fold CV config for evaluation' do
      expect(@classifier['cross-validation-fold']).to eq 10
    end

    it 'has unbalanced training data by default for training' do
      expect(@classifier['training-data-options']).to eq 'unbalanced'
    end
  end
end
601
+ end
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Wikipedia::VandalismDetection::Diff: construction from strings
# containing an invalid byte, and the words reported as inserted/removed
# between two texts.
describe Wikipedia::VandalismDetection::Diff do
  let(:old_text) { Text.new("hello\nworld\nmy name is Luke\n") }
  let(:new_text) { Text.new("world\nhello\nmy name is Mr. Skywalker\n") }
  let(:diff) { described_class.new(old_text, new_text) }

  it 'can deal with invalid byte sequences' do
    # "\255" is the single byte 0xAD, which is not valid on its own in UTF-8.
    broken = "text \255".force_encoding('UTF-8')
    result = described_class.new("#{broken} a", "#{broken} b")

    expect(result).to be_a described_class
  end

  describe '#inserted_words' do
    let(:inserted) { diff.inserted_words }

    it 'returns the inserted words as array' do
      expect(inserted).to be_an Array
      expect(inserted.count).to eq 3
    end

    it 'returns the right inserted words' do
      expected_words = %w[hello Mr. Skywalker]

      expect(inserted).to eq expected_words
    end
  end

  describe '#removed_words' do
    let(:removed) { diff.removed_words }

    it 'returns the removed words as array' do
      expect(removed).to be_an Array
      expect(removed.count).to eq 2
    end

    it 'returns the right removed words' do
      expected_words = %w[hello Luke]

      expect(removed).to eq expected_words
    end
  end
end