wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,227 @@
1
+ require 'spec_helper'
2
+ require 'fileutils'
3
+ require 'weka'
4
+
5
# Specs for Wikipedia::VandalismDetection::TestDataset.
#
# The class itself is the subject, so every example calls its class-level
# methods. Each example starts from the test configuration; the `after` hook
# deletes the arff and index files an example may have written.
describe Wikipedia::VandalismDetection::TestDataset do
  subject { Wikipedia::VandalismDetection::TestDataset }

  before do
    use_test_configuration
    @config = test_config

    @arff_file = @config.test_output_arff_file
    @index_file = @config.test_output_index_file
    @features = @config.features

    # Directory in which the per-feature arff files are looked up below.
    @arff_files_dir = File.join(@config.output_base_directory, 'test')
  end

  after do
    # The merged arff file is removed together with its containing directory.
    if File.exist?(@arff_file)
      File.delete(@arff_file)
      directory = File.dirname(@arff_file)
      FileUtils.rm_r(directory)
    end

    File.delete(@index_file) if File.exist?(@index_file)

    # remove feature arff files
    @config.features.each do |name|
      file = File.join(@arff_files_dir, "#{name.tr(' ', '_')}.arff")

      next unless File.exist?(file)

      File.delete(file)
      directory = File.dirname(file)
      FileUtils.rm_r(directory)
    end
  end

  describe '#build' do
    describe 'exceptions' do
      it 'raises an error if no edits file is configured' do
        config = test_config
        config.instance_variable_set(:@test_corpus_edits_file, nil)
        use_configuration(config)

        expect { subject.build }.to raise_error \
          Wikipedia::VandalismDetection::EditsFileNotConfiguredError
      end
    end

    it 'returns a weka instances' do
      expect(subject.build).to be_a Java::WekaCore::Instances
    end

    # One example is generated per default feature name.
    Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['features'].each do |name|
      it "creates an arff file for the feature '#{name}'" do
        config = test_config
        config.instance_variable_set(:@features, [name])
        use_configuration(config)

        file = File.join(@arff_files_dir, "#{name.tr(' ', '_')}.arff")

        expect(File.exist?(file)).to be false
        subject.build
        expect(File.exist?(file)).to be true
      end
    end

    it 'creates only feature files that are not available yet' do
      config = test_config
      config.instance_variable_set(:@features, ['anonymity', 'comment length'])
      use_configuration(config)

      anonymity_file = File.join(config.output_base_directory, 'test', 'anonymity.arff')

      # create file manually, so it is existent when building dataset
      data = [1, 2, 3]
      anonymity = Wikipedia::VandalismDetection::Instances.empty_for_test_feature('anonymity')
      6.times { anonymity.add_instance(data) }
      anonymity.to_arff(anonymity_file)

      subject.build

      # anonymity should not be overwritten
      values = Weka::Core::Instances.from_arff(anonymity_file).first.values
      expect(values).to eq data
    end

    describe 'internal algorithm' do
      it 'builds the right number of data lines' do
        # NOTE(review): the edit count is read from the *training* edits file
        # although this spec covers the test dataset - presumably both corpora
        # yield the same number of instances; confirm before changing.
        #
        # File.foreach closes the file after reading it. One line is
        # subtracted (presumably the CSV header).
        edits_count = File.foreach(@config.training_corpus_edits_file).count - 1
        additional_header_lines = 4 # without class
        revision_id_lines = 2 # old and new revision id attributes
        class_line = 1

        lines_count = [
          additional_header_lines,
          edits_count,
          @features.count,
          revision_id_lines,
          class_line
        ].inject(:+)

        dataset = subject.build

        expect(dataset.to_s.lines.count).to eq lines_count
      end

      it 'builds the right number of data columns' do
        old_and_new_edit_attr_count = 2
        class_value = 1
        dataset = subject.build

        attributes_count = [
          @features.count,
          class_value,
          old_and_new_edit_attr_count
        ].inject(:+)

        expect(dataset.attributes_count).to eq attributes_count
      end

      it 'builds a class attribute' do
        dataset = subject.build
        expect(dataset.attributes.last.name).to eq 'class'
      end
    end
  end

  describe '#instances' do
    it 'is an alias method for #build' do
      build = subject.method(:build)
      instances = subject.method(:instances)

      expect(build).to eq instances
    end
  end

  # The label now names the method that the examples actually call.
  describe '#create_corpus_file_index!' do
    it 'responds to #create_corpus_file_index!' do
      expect(subject).to respond_to :create_corpus_file_index!
    end

    describe 'exceptions' do
      it 'raises an error unless revisions directory is configured' do
        config = test_config
        config.instance_variable_set(:@test_corpus_revisions_directory, nil)
        use_configuration(config)

        expect { subject.create_corpus_file_index! }.to raise_error \
          Wikipedia::VandalismDetection::RevisionsDirectoryNotConfiguredError
      end
    end

    it 'creates a corpus_index.yml file in the build directory' do
      expect(File.exist?(@index_file)).to be false
      subject.create_corpus_file_index!
      expect(File.exist?(@index_file)).to be true
    end
  end

  describe '#build!' do
    it { is_expected.to respond_to :build! }

    it 'creates an .arff file in the configured directory' do
      expect(File.exist?(@arff_file)).to be false
      subject.build!
      expect(File.exist?(@arff_file)).to be true
    end

    it 'overwrites existing test arff file' do
      use_test_configuration

      # test config uses 3 features + 2 edit id columns + 1 class value = 6
      subject.build!
      first_parsed_dataset = Weka::Core::Instances.from_arff(@arff_file)
      expect(first_parsed_dataset.attributes_count).to eq 6

      config = test_config
      config.instance_variable_set(:@features, ['anonymity'])
      use_configuration(config)

      # uses only 1 feature + 2 edit id columns + 1 class value = 4
      subject.build!
      second_parsed_dataset = Weka::Core::Instances.from_arff(@arff_file)

      expect(second_parsed_dataset.attributes_count).to eq 4
    end
  end

  describe '#edit' do
    it 'raises an error unless edits file is configured' do
      config = test_config
      config.instance_variable_set(:@test_corpus_edits_file, nil)
      use_configuration(config)

      expect { subject.edit('1', '2') }.to raise_error \
        Wikipedia::VandalismDetection::EditsFileNotConfiguredError
    end

    it 'returns nil if Edit could not be found' do
      edit = subject.edit('1', '2')
      expect(edit).to be_nil
    end

    it 'returns an Edit' do
      edit = subject.edit('307084144', '326873205')
      expect(edit).to be_an Edit
    end

    it 'returns an edit whose parent page title is not nil' do
      edit = subject.edit('307084144', '326873205')
      expect(edit.page.title).to_not be_nil
    end

    it 'returns an edit whose parent page id is not nil' do
      edit = subject.edit('307084144', '326873205')
      expect(edit.page.id).to_not be_nil
    end

    it 'returns nil for a not annotated edit with given revision ids' do
      edit = subject.edit('328774088', '328774188')
      expect(edit).to be_nil
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Text: construction from raw strings and wiki markup cleaning.
describe Text do
  it { is_expected.to be_a String }
  it { is_expected.to respond_to :clean }

  describe '#initialize' do
    it 'removes invalid byte sequences' do
      # "\255" is an octal escape for the single byte 0xAD, which is not a
      # valid UTF-8 sequence on its own.
      text = Text.new("text \255".force_encoding('UTF-8'))
      expect(text).to eq 'text '
    end
  end

  describe '#clean' do
    it 'raises a WikitextExtractionError if text cannot be parsed' do
      text = Text.new("[[Image:img.jpg|\n{|\n|-\n|||| |}")

      expect { text.clean }.to raise_error \
        Wikipedia::VandalismDetection::WikitextExtractionError
    end

    it 'returns the text cleaned from wiki tags' do
      wiki_text = Text.new(load_file('sample_revision.txt'))
      clean_text = load_file('sample_revision_clean_text.txt')

      expect(wiki_text.clean).to eq clean_text
    end
  end
end
@@ -0,0 +1,266 @@
1
+ require 'spec_helper'
2
+ require 'fileutils'
3
+
4
# Specs for Wikipedia::VandalismDetection::TrainingDataset.
#
# Each example starts from the test configuration; the `after` hook deletes
# the arff and index files an example may have written.
describe Wikipedia::VandalismDetection::TrainingDataset do
  # Counts the instances of +dataset+ whose class label equals the value of
  # the Instances constant named by +class_const+ (e.g. :VANDALISM).
  def count_instances_of(dataset, class_const)
    expected_label = Instances.const_get(class_const)

    dataset.enumerate_instances.count do |instance|
      Instances::CLASSES[instance.class_value.to_i] == expected_label
    end
  end

  before do
    use_test_configuration
    @config = test_config

    @arff_file = @config.training_output_arff_file
    @index_file = @config.training_output_index_file
    @annotations_file = @config.training_corpus_annotations_file

    # Directory in which the per-feature arff files are looked up below.
    @arff_files_dir = File.join(@config.output_base_directory, 'training')
  end

  after do
    # The merged arff file is removed together with its containing directory.
    if File.exist?(@arff_file)
      File.delete(@arff_file)
      directory = File.dirname(@arff_file)
      FileUtils.rm_r(directory)
    end

    File.delete(@index_file) if File.exist?(@index_file)

    # remove feature arff files
    @config.features.each do |name|
      file = File.join(@arff_files_dir, "#{name.tr(' ', '_')}.arff")

      next unless File.exist?(file)

      File.delete(file)
      directory = File.dirname(file)
      FileUtils.rm_r(directory)
    end
  end

  describe '#build' do
    it 'returns a weka instances' do
      dataset = TrainingDataset.build
      expect(dataset).to be_a Java::WekaCore::Instances
    end

    describe 'exceptions' do
      it 'raises error unless edits file is configured' do
        config = test_config
        config.instance_variable_set(:@training_corpus_edits_file, nil)
        use_configuration(config)

        expect { TrainingDataset.build }.to raise_error \
          Wikipedia::VandalismDetection::EditsFileNotConfiguredError
      end

      it 'raises error unless annotations file is configured' do
        config = test_config
        config.instance_variable_set(:@training_corpus_annotations_file, nil)
        use_configuration(config)

        expect { TrainingDataset.build }.to raise_error \
          Wikipedia::VandalismDetection::AnnotationsFileNotConfiguredError
      end
    end

    # One example is generated per default feature name.
    Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['features'].each do |name|
      it "creates an arff file for the feature '#{name}'" do
        config = test_config
        config.instance_variable_set(:@features, [name])
        use_configuration(config)

        file = File.join(@arff_files_dir, "#{name.tr(' ', '_')}.arff")

        expect(File.exist?(file)).to be false
        TrainingDataset.build
        expect(File.exist?(file)).to be true
      end
    end

    it 'creates only feature files that are not available yet' do
      config = test_config
      config.instance_variable_set(:@features, ['anonymity', 'comment length'])
      use_configuration(config)

      # NOTE(review): this fixture is written to the 'test' output directory
      # and built with Instances.empty_for_test_feature, not to the 'training'
      # directory used elsewhere in this spec - looks copied from the
      # TestDataset spec; confirm the intent.
      anonymity_file = File.join(config.output_base_directory, 'test', 'anonymity.arff')

      begin
        # create file manually, so it is existent when building the dataset
        data = [1, 2, 3]
        anonymity = Instances.empty_for_test_feature('anonymity')
        6.times { anonymity.add_instance(data) }
        anonymity.to_arff(anonymity_file)

        TrainingDataset.build

        # anonymity should not be overwritten
        values = Weka::Core::Instances.from_arff(anonymity_file).first.values
        expect(values).to eq data
      ensure
        # The after hook only looks for feature files under the 'training'
        # directory, so the hand-made file is removed here.
        FileUtils.rm_f(anonymity_file)
      end
    end

    describe 'internal algorithm' do
      let(:features_count) { @config.features.count }

      it 'builds the right number of data lines' do
        dataset = TrainingDataset.build
        # File.foreach closes the file after reading it. One line is
        # subtracted (presumably the CSV header).
        annotations_count = File.foreach(@annotations_file).count - 1
        additional_header_lines = 5

        total_lines = additional_header_lines + annotations_count + features_count

        expect(dataset.to_s.lines.count).to eq total_lines
      end

      it 'builds the right number of data columns' do
        dataset = TrainingDataset.build
        expect(dataset.attributes_count).to eq features_count + 1
      end
    end

    describe 'replacing missing values' do
      it 'replaces missing values if configured' do
        config = test_config
        config.instance_variable_set(:@replace_missing_values, 'true')
        use_configuration(config)

        dataset = TrainingDataset.build

        filter = /weka\.filters\.unsupervised\.attribute\.ReplaceMissingValues/
        expect(dataset.to_s).to match filter
      end

      it 'does not replace missing values if not configured' do
        config = test_config
        config.instance_variable_set(:@replace_missing_values, 'Nope')
        use_configuration(config)

        dataset = TrainingDataset.build

        filter = /weka\.filters\.unsupervised\.attribute\.ReplaceMissingValues/
        expect(dataset.to_s).not_to match filter
      end
    end
  end

  describe '#instances' do
    it 'is an alias method for #build' do
      build = TrainingDataset.method(:build)
      instances = TrainingDataset.method(:instances)

      expect(build).to eq instances
    end
  end

  describe '#balanced_instances' do
    before do
      config = test_config
      config.instance_variable_set(:@training_data_options, 'balanced')
      use_configuration(config)

      @dataset = TrainingDataset.balanced_instances
    end

    it 'returns a weka dataset' do
      expect(@dataset).to be_a Java::WekaCore::Instances
    end

    it 'returns a dataset of right size built from the configured corpus' do
      # 2 vandalism, 2 regular, see resources/corpora/training/annotations.csv
      expect(@dataset.size).to eq 4
    end

    %i[VANDALISM REGULAR].each do |class_const|
      it "has the right number of '#{class_const.downcase}' samples in its instances" do
        expect(count_instances_of(@dataset, class_const)).to eq 2
      end
    end
  end

  describe '#oversampled_instances' do
    describe 'with default options' do
      before do
        config = test_config
        config.instance_variable_set(:@training_data_options, 'oversampled')
        use_configuration(config)

        # default -P 100 -U true
        @dataset = TrainingDataset.oversampled_instances
      end

      it 'returns a weka dataset' do
        expect(@dataset).to be_a Java::WekaCore::Instances
      end

      it 'returns a dataset of size 8 built from the configured corpus' do
        # 4 vandalism, 4 regular, see resources/corpora/training/annotations.csv
        expect(@dataset.size).to eq 8
      end

      %i[VANDALISM REGULAR].each do |class_const|
        it "has the right number of '#{class_const.downcase}' samples in its instances" do
          expect(count_instances_of(@dataset, class_const)).to eq 4
        end
      end

      it 'returns the right-sized SMOTEd dataset from the configured corpus' do
        # 4 vandalism, 4 regular, see resources/corpora/training/annotations.csv
        dataset = TrainingDataset.oversampled_instances(percentage: 200)
        expect(dataset.size).to eq 8
      end
    end

    describe 'with custom options' do
      before do
        config = test_config
        options = 'oversampled -p 300 -u false'
        config.instance_variable_set(:@training_data_options, options)
        use_configuration(config)

        @dataset = TrainingDataset.oversampled_instances
      end

      it 'returns a weka dataset' do
        expect(@dataset).to be_a Java::WekaCore::Instances
      end

      it 'returns the right dataset size built from the configured corpus' do
        # 2 + 300 % = 8 vandalism, 4 regular,
        # see resources/corpora/training/annotations.csv
        expect(@dataset.size).to eq 12
      end
    end
  end

  # The label now names the method that the examples actually call.
  describe '#create_corpus_file_index!' do
    it 'responds to #create_corpus_file_index!' do
      expect(TrainingDataset).to respond_to :create_corpus_file_index!
    end

    describe 'exceptions' do
      it 'raises an error if no revisions directory is configured' do
        config = test_config
        config.instance_variable_set(:@training_corpus_revisions_directory, nil)
        use_configuration(config)

        expect { TrainingDataset.create_corpus_file_index! }.to raise_error \
          Wikipedia::VandalismDetection::RevisionsDirectoryNotConfiguredError
      end
    end

    it 'creates a corpus_index.yml file in the build directory' do
      expect(File.exist?(@index_file)).to be false
      TrainingDataset.create_corpus_file_index!
      expect(File.exist?(@index_file)).to be true
    end
  end
end
@@ -0,0 +1,97 @@
1
+ require 'spec_helper'
2
+
3
# Examples for the class-level API of WikitextExtractor:
#   .extract        turns wikitext into plain text
#   .extract_clean  additionally strips numbering, line breaks, repeated
#                   spaces and links (as exercised by the examples below)
describe Wikipedia::VandalismDetection::WikitextExtractor do
  # The methods under test are called on the class itself.
  subject { described_class }

  # Markup that both extraction methods are expected to reject.
  let(:unparsable_wiki_text) { "[[Image:img.jpg|\n{|\n|-\n|||| |}" }

  describe '.extract' do
    it 'can handle invalid byte sequences' do
      wiki_text = "text \255".force_encoding('UTF-8')
      extracted_text = subject.extract(wiki_text)

      expect(extracted_text).to eq 'text'
    end

    it 'returns an empty string if all the markup is extracted' do
      wiki_text = '{{speedy deletion}}'
      extracted_text = subject.extract(wiki_text)

      expect(extracted_text).to be_empty
    end

    it 'removes #REDIRECT markup' do
      wiki_text = '#REDIRECT [[Heading]]'
      plain_text = 'Heading'
      extracted_text = subject.extract(wiki_text)

      expect(extracted_text).to eq plain_text
    end

    it 'can extract plaintext from wikitext' do
      wiki_text = load_file('sample_revision.txt')
      plain_text = load_file('sample_revision_plain_text.txt')
      extracted_text = subject.extract(wiki_text)

      # The expected text ends with a newline that the extracted text lacks.
      # Build a new string for the comparison instead of appending with `<<`,
      # which would mutate the extractor's return value (and raise on a
      # frozen string).
      expect("#{extracted_text}\n").to eq plain_text
    end

    it 'raises a WikitextExtractionError when extracting unparsable text' do
      expect { subject.extract(unparsable_wiki_text) }
        .to raise_error Wikipedia::VandalismDetection::WikitextExtractionError
    end
  end

  describe '.extract_clean' do
    it 'can extract full cleaned text from wikitext' do
      wiki_text = load_file('sample_revision.txt')
      clean_text = load_file('sample_revision_clean_text.txt')
      extracted_text = subject.extract_clean(wiki_text)

      expect(extracted_text).to eq clean_text
    end

    it 'removes section numbering while cleaning wikitext' do
      wiki_text = "1.1. header 1\n\n1.2. header 2"
      clean_text = 'header 1 header 2'
      extracted_text = subject.extract_clean(wiki_text)

      expect(extracted_text).to eq clean_text
    end

    it 'removes line breaks while cleaning wikitext' do
      wiki_text = "line 1\n\nline 2\nline 3"
      clean_text = 'line 1 line 2 line 3'
      extracted_text = subject.extract_clean(wiki_text)

      expect(extracted_text).to eq clean_text
    end

    it 'removes multiple spaces while cleaning wikitext' do
      # NOTE(review): the literal below shows only single spaces; confirm the
      # fixture really contains runs of spaces as the example name implies.
      wiki_text = "line 1 \n\nline 2 \nline 3 "
      clean_text = 'line 1 line 2 line 3'
      extracted_text = subject.extract_clean(wiki_text)

      expect(extracted_text).to eq clean_text
    end

    it 'removes links from text while cleaning wikitext' do
      wiki_text = "A link\nis here http://example.com/image.jpg not\nanymore." \
                  "\n==Reference==\n" \
                  '*[http://www.itis.usda.gov/servlet/SingleRpt/SingleRpt?' \
                  'search_topic=TSN&amp;search_value=180211 ITIS 180211] 2002-12-14'

      clean_text = 'A link is here not anymore. Reference ITIS 180211 2002-12-14'
      extracted_text = subject.extract_clean(wiki_text)

      expect(extracted_text).to eq clean_text
    end

    it 'raises a WikitextExtractionError while extracting unparsable text' do
      expect { subject.extract_clean(unparsable_wiki_text) }
        .to raise_error Wikipedia::VandalismDetection::WikitextExtractionError
    end
  end
end
@@ -0,0 +1,82 @@
1
+ require 'spec_helper'
2
+
3
# Examples for the OneClassClassifier wrapper: it must be the Java Weka
# class, be usable through VandalismDetection::Classifier with either target
# class, and report its type name.
describe Weka::Classifiers::Meta::OneClassClassifier do
  let(:classifier_type) { 'Meta::OneClassClassifier' }

  # Configures the one-class classifier (target class: vandalism) and
  # replaces TrainingDataset.instances with an enlarged dataset.
  before do
    detection = Wikipedia::VandalismDetection

    @config = test_config
    classifier_options = '-W weka.classifiers.trees.RandomForest -- -I 100'
    @w_options = "-W weka.classifiers.meta.Bagging -- #{classifier_options}"
    options = "-tcl #{detection::Instances::VANDALISM} #{@w_options}"

    @config.instance_variable_set(:@classifier_type, classifier_type)
    @config.instance_variable_set(:@classifier_options, options)
    @config.instance_variable_set(:@cross_validation_fold, 2)

    use_configuration(@config)

    # add more test instances because instances number must be higher than
    # cross validation fold:
    data = detection::TrainingDataset.instances.to_m.to_a
    dataset = detection::Instances.empty

    2.times do
      data.each do |row|
        # Keep every value except the last column and attach a randomly
        # picked class label (index 0 or 1 of Instances::CLASSES), so the
        # label distribution differs between runs.
        values = row[0..-2]
        class_value = detection::Instances::CLASSES[rand(0..1)]

        dataset.add_instance([*values, class_value])
      end
    end

    allow(detection::TrainingDataset)
      .to receive(:instances)
      .and_return(dataset)
  end

  # Removes the ARFF file (with its directory) and the build directory that
  # an example may have produced.
  after do
    arff_file = @config.training_output_arff_file
    build_dir = @config.output_base_directory

    if File.exist?(arff_file)
      File.delete(arff_file)
      FileUtils.rm_r(File.dirname(arff_file))
    end

    FileUtils.rm_r(build_dir) if Dir.exist?(build_dir)
  end

  it { is_expected.to be_a Java::WekaClassifiersMeta::OneClassClassifier }

  it 'can be used to classify vandalism' do
    classifier = Wikipedia::VandalismDetection::Classifier.new
    features = [1.0, 2.0, 55.0]

    expect(classifier.classify(features)).to be_between(0.0, 1.0)
  end

  it 'can be used to classify vandalism using regulars' do
    regular = Wikipedia::VandalismDetection::Instances::REGULAR
    options = "-tcl #{regular} #{@w_options}"

    # @classifier_type was already set on @config by the before hook; only
    # the target class option changes here.
    @config.instance_variable_set(:@classifier_options, options)

    use_configuration(@config)

    classifier = Wikipedia::VandalismDetection::Classifier.new
    features = [1.0, 2.0, 8.0]

    expect(classifier.classify(features)).to be_between(0.0, 1.0)
  end

  # `type` is called on the class, hence the "." prefix.
  describe '.type' do
    it 'returns the classifier’s type name' do
      expect(described_class.type).to eq 'Meta::OneClassClassifier'
    end
  end
end