wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,135 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::FeatureCalculator do
4
+ let(:edit) { build(:edit) }
5
+
6
+ it 'raises FeaturesNotConfiguredError when no features are configured' do
7
+ config = Wikipedia::VandalismDetection::Configuration.send(:new)
8
+ config.instance_variable_set(:@features, nil)
9
+
10
+ use_configuration(config)
11
+
12
+ expect { Wikipedia::VandalismDetection::FeatureCalculator.new }
13
+ .to raise_error Wikipedia::VandalismDetection::FeaturesNotConfiguredError
14
+ end
15
+
16
+ before do
17
+ use_test_configuration
18
+ @calculator = Wikipedia::VandalismDetection::FeatureCalculator.new
19
+ end
20
+
21
+ describe '#calculate_features_for' do
22
+ it { is_expected.to respond_to :calculate_features_for }
23
+
24
+ it 'takes an edit as parameter' do
25
+ expect { @calculator.calculate_features_for(edit) }
26
+ .not_to raise_error
27
+ end
28
+
29
+ it 'raises an error if called with wrong parameter type' do
30
+ revision = build(:empty_revision)
31
+ expect { @calculator.calculate_features_for(revision) }
32
+ .to raise_error ArgumentError
33
+ end
34
+
35
+ it 'returns an array' do
36
+ expect(@calculator.calculate_features_for(edit)).to be_an Array
37
+ end
38
+
39
+ it 'returns the computed numeric feature values' do
40
+ feature_values = @calculator.calculate_features_for(edit)
41
+ expect(feature_values.all? { |value| value.is_a?(Numeric) }).to be true
42
+ end
43
+
44
+ it 'returns the right number of feature values' do
45
+ count = @calculator.used_features.count
46
+ expect(@calculator.calculate_features_for(edit).count).to eq count
47
+ end
48
+
49
+ it 'uses the cleaned up text if revision contains a #REDIRECT' do
50
+ redirect_text = Text.new('#REDIRECT [[Redirect page]]')
51
+ old_revision_redirect = build(:old_revision, text: redirect_text)
52
+ new_revision_redirect = build(:new_revision, text: redirect_text)
53
+ old_revision = build(:old_revision)
54
+ new_revision = build(:new_revision)
55
+
56
+ edit_a = Wikipedia::VandalismDetection::Edit.new(old_revision_redirect, new_revision)
57
+ edit_b = Wikipedia::VandalismDetection::Edit.new(old_revision, new_revision_redirect)
58
+
59
+ config = Wikipedia::VandalismDetection.config
60
+ count = config.features.count
61
+
62
+ expect(@calculator.calculate_features_for(edit_a).count).to eq count
63
+ expect(@calculator.calculate_features_for(edit_b).count).to eq count
64
+ end
65
+
66
+ it 'includes -1 for not extractable texts in either revision' do
67
+ config = Wikipedia::VandalismDetection::Configuration.instance
68
+ config.instance_variable_set(:@features, ['all wordlists impact'])
69
+
70
+ use_configuration(config)
71
+
72
+ unparsable_wiki_text = Text.new("[[Image:img.jpg|\n{|\n|-\n|||| |}")
73
+
74
+ old_revision_unparsable = build(:old_revision, text: unparsable_wiki_text)
75
+ new_revision_unparsable = build(:new_revision, text: unparsable_wiki_text)
76
+
77
+ old_revision = build(:old_revision)
78
+ new_revision = build(:new_revision)
79
+
80
+ edit_a = Wikipedia::VandalismDetection::Edit.new(old_revision_unparsable, new_revision)
81
+ edit_b = Wikipedia::VandalismDetection::Edit.new(old_revision, new_revision_unparsable)
82
+
83
+ expect(subject.calculate_features_for(edit_a)).to include Features::MISSING_VALUE
84
+ expect(subject.calculate_features_for(edit_b)).to include Features::MISSING_VALUE
85
+ end
86
+ end
87
+
88
+ describe '#calculate_feature_for' do
89
+ let(:feature_name) { 'anonymity' }
90
+ let(:random_number) { rand(1000) }
91
+ let(:empty_revision) { build(:empty_revision) }
92
+
93
+ before do
94
+ allow_any_instance_of(Features::Anonymity)
95
+ .to receive(:calculate)
96
+ .and_return(random_number)
97
+ end
98
+
99
+ it { is_expected.to respond_to :calculate_feature_for }
100
+
101
+ it 'takes an edit and feature name as parameter' do
102
+ expect { @calculator.calculate_feature_for(edit, feature_name) }
103
+ .not_to raise_error
104
+ end
105
+
106
+ it 'raises an error if called with wrong parameter type edit' do
107
+ expect { @calculator.calculate_feature_for(empty_revision, feature_name) }
108
+ .to raise_error ArgumentError
109
+ end
110
+
111
+ it 'raises an error if called with wrong parameter type feature name' do
112
+ expect { @calculator.calculate_feature_for(edit, empty_revision) }
113
+ .to raise_error ArgumentError
114
+ end
115
+
116
+ it 'returns a Numeric' do
117
+ expect(@calculator.calculate_feature_for(edit, feature_name))
118
+ .to be_a Numeric
119
+ end
120
+
121
+ it 'returns the value calculated by the feature class' do
122
+ expect(@calculator.calculate_feature_for(edit, feature_name))
123
+ .to eq random_number
124
+ end
125
+ end
126
+
127
+ describe '#used_features' do
128
+ it { is_expected.to respond_to :used_features }
129
+
130
+ it 'returns an array of the features defined in the config feature.yml' do
131
+ features = Wikipedia::VandalismDetection.config.features
132
+ expect(@calculator.used_features).to match_array features
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::AllWordlistsFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the inserted number of all lists words over all inserted' do
8
+ # inserted: total 7 words, 1 vulgarism, 1 biased, 1 pronouns = 3 bad
9
+ old_text = Text.new('Your old shit. ')
10
+ new_text = Text.new('Your old shit. Fuck you great, and all the others.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 7.0
17
+ end
18
+
19
+ it 'returns 0.0 on empty clean inserted text' do
20
+ old_text = Text.new('Your old shit. ')
21
+ new_text = Text.new('Your old shit. {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,52 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::AllWordlistsImpact do
4
+ it { is_expected.to be_a Features::ImpactBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of all wordlists words of the new revision text' do
8
+ # 1 vulgarism, 2 pronouns, 0 biased = 3 bad
9
+ old_text = Text.new('Fuck you, you and all the others')
10
+ # 3 vulgarism, 3 pronouns, 1 biased = 7 bad
11
+ new_text = Text.new('Fuck you great, fuck you, you all others sluts')
12
+
13
+ old_rev = build(:old_revision, text: old_text)
14
+ new_rev = build(:new_revision, text: new_text)
15
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
16
+
17
+ expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 7.0)
18
+ end
19
+
20
+ it 'returns 0.5 if both text revisions have no terms' do
21
+ text = Text.new('{{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: text)
24
+ new_rev = build(:new_revision, text: text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.5
28
+ end
29
+
30
+ it 'returns 0.0 for an empty clean text in the old revision' do
31
+ old_text = Text.new('{{speedy deletion}}')
32
+ new_text = Text.new('fuck')
33
+
34
+ old_rev = build(:old_revision, text: old_text)
35
+ new_rev = build(:new_revision, text: new_text)
36
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
37
+
38
+ expect(subject.calculate(edit)).to eq 0.0
39
+ end
40
+
41
+ it 'returns 1.0 for an empty clean text in the new revision' do
42
+ old_text = Text.new('fuck')
43
+ new_text = Text.new('{{speedy deletion}}')
44
+
45
+ old_rev = build(:old_revision, text: old_text)
46
+ new_rev = build(:new_revision, text: new_text)
47
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
48
+
49
+ expect(subject.calculate(edit)).to eq 1.0
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,67 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::AnonymityPrevious do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ context 'both contributors are given' do
8
+ it 'returns 1.0 in case of a registered previous editor' do
9
+ old_rev = build(:old_revision, contributor: 'Peter')
10
+ new_rev = build(:new_revision)
11
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
12
+
13
+ expect(subject.calculate(edit)).to eq 1
14
+ end
15
+
16
+ it 'returns 0.0 in case of an anonymous previous editor' do
17
+ old_rev = build(:old_revision, contributor: '137.163.16.199')
18
+ new_rev = build(:new_revision)
19
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
20
+
21
+ expect(subject.calculate(edit)).to eq 0
22
+ end
23
+ end
24
+
25
+ context 'previous contributor not given' do
26
+ context 'for a registered previous editor' do
27
+ it 'requests the user from Wikipedia API and returns 1' do
28
+ old_rev = build(:old_revision, id: '324557983', contributor: nil)
29
+ new_rev = build(
30
+ :new_revision,
31
+ id: '329962649',
32
+ parent_id: '324557983',
33
+ contributor: 'Tomaxer'
34
+ )
35
+
36
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
37
+
38
+ expect(subject.calculate(edit)).to eq 1
39
+ end
40
+ end
41
+
42
+ context 'for an anonymous previous editor' do # also same editor!
43
+ it 'requests the user from Wikipedia API and returns 0' do
44
+ old_rev = build(:old_revision, id: '328774110', contributor: nil)
45
+ new_rev = build(:new_revision, id: '328774035', parent_id: '328774110')
46
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
47
+
48
+ expect(subject.calculate(edit)).to eq 0
49
+ end
50
+ end
51
+
52
+ context 'if old revision is not available anymore' do
53
+ it 'returns missing' do
54
+ # to get api call, see:
55
+ # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=timestamp&revids=325218985
56
+ # <rev revid="325218985"/>
57
+
58
+ old_rev = build(:old_revision, id: '325218985', contributor: nil)
59
+ new_rev = build(:new_revision, id: '326980599', parent_id: '325218985')
60
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
61
+
62
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::Anonymity do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns 1.0 in case of a registered editor' do
8
+ edit = build :edit, new_revision: build(:registered_revision)
9
+ expect(subject.calculate(edit)).to eq 1
10
+ end
11
+
12
+ it 'returns 0.0 in case of an anonymous editor' do
13
+ edit = build :edit, new_revision: build(:anonymous_revision)
14
+ expect(subject.calculate(edit)).to eq 0
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::ArticleSize do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the size of the edit’s new revisions' do
8
+ old_rev_text = Text.new('123')
9
+ new_rev_text = Text.new('123 456789') # size 10 (with spaces)
10
+
11
+ old_rev = build(:old_revision, text: old_rev_text)
12
+ new_rev = build(:new_revision, text: new_rev_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 10
16
+ end
17
+
18
+ it "returns 0 if the edit's new revisions is empty" do
19
+ old_rev_text = Text.new('123')
20
+ new_rev_text = Text.new # size 0
21
+
22
+ old_rev = build(:old_revision, text: old_rev_text)
23
+ new_rev = build(:new_revision, text: new_rev_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::BadFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of bad words relative to all words count' do
8
+ # total 6 words, 3 bad
9
+ old_text = Text.new('Old whatever.')
10
+ new_text = Text.new('Old whatever. New ugly contents. Hi, gotta fun.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 6.0
17
+ end
18
+
19
+ it 'returns 0.0 for an empty clean text in the new revision' do
20
+ old_text = Text.new('Old guy.')
21
+ new_text = Text.new('Old guy. {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,53 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::BadImpact do
4
+ it { is_expected.to be_a Features::ImpactBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of bad words of the edit’s new revision text' do
8
+ # 3 bad words
9
+ old_text = Text.new('Hi, old text 666, dont know')
10
+
11
+ # 4 bad words
12
+ new_text = Text.new('Hi, new text dosent, whatever, guy')
13
+
14
+ old_rev = build(:old_revision, text: old_text)
15
+ new_rev = build(:new_revision, text: new_text)
16
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
17
+
18
+ expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
19
+ end
20
+
21
+ it 'returns 0.5 on both no terms in text revisions' do
22
+ text = Text.new('{speedy deletion}}')
23
+
24
+ old_rev = build(:old_revision, text: text)
25
+ new_rev = build(:new_revision, text: text)
26
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
27
+
28
+ expect(subject.calculate(edit)).to eq 0.5
29
+ end
30
+
31
+ it 'returns 0.0 on emtpy clean text of old revision' do
32
+ old_text = Text.new('{{speedy deletion}}')
33
+ new_text = Text.new('Guy')
34
+
35
+ old_rev = build(:old_revision, text: old_text)
36
+ new_rev = build(:new_revision, text: new_text)
37
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
38
+
39
+ expect(subject.calculate(edit)).to eq 0.0
40
+ end
41
+
42
+ it 'returns 1.0 on emtpy clean text of new revision' do
43
+ old_text = Text.new('Guy')
44
+ new_text = Text.new('{{speedy deletion}}')
45
+
46
+ old_rev = build(:old_revision, text: old_text)
47
+ new_rev = build(:new_revision, text: new_text)
48
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
49
+
50
+ expect(subject.calculate(edit)).to eq 1.0
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,41 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::Base do
4
+ describe '#count' do
5
+ let(:text) { 'I, you: i will help You' }
6
+
7
+ it { is_expected.to respond_to(:count).with(2).arguments }
8
+
9
+ it 'raises an error if option :in is not defined' do
10
+ expect { subject.count(%i[i you], from: text) }
11
+ .to raise_error ArgumentError
12
+ end
13
+
14
+ it 'raises an error if terms is not an Array or String' do
15
+ expect { subject.count({ term: 'You' }, in: text) }
16
+ .to raise_error ArgumentError
17
+ end
18
+
19
+ it 'returns the total number of terms found for the given terms array' do
20
+ terms = %i[i you]
21
+ expect(subject.count(terms, in: text)).to eq 4
22
+ end
23
+
24
+ it 'returns the number of terms found for the given single term' do
25
+ expect(subject.count('You', in: text)).to eq 2
26
+ end
27
+ end
28
+
29
+ describe '#calculate' do
30
+ it { is_expected.to respond_to :calculate }
31
+
32
+ it 'takes an Wikipedia::Edit as argument' do
33
+ edit = build(:edit)
34
+ expect { subject.calculate(edit) }.not_to raise_error ArgumentError
35
+ end
36
+
37
+ it 'raises an ArgumentError if argument is no Wikipedia::Edit' do
38
+ expect { subject.calculate('string') }.to raise_error ArgumentError
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::BiasedFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the inserted number of biased words over all inserted words' do
8
+ # inserted: total 7 words, 3 biased
9
+ old_text = Text.new('Great old.')
10
+ new_text = Text.new('Great old. This is so great, really a classic.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 7.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy clean inserted text' do
20
+ old_text = Text.new('Great old.')
21
+ new_text = Text.new('Great old. {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,52 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::BiasedImpact do
4
+ it { is_expected.to be_a Features::ImpactBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of biased words of the new revision text' do
8
+ # 1 biased word
9
+ old_text = Text.new('this is classic!')
10
+ # 3 biased words
11
+ new_text = Text.new('This is classic, legendary and amazing!')
12
+
13
+ old_rev = build(:old_revision, text: old_text)
14
+ new_rev = build(:new_revision, text: new_text)
15
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
16
+
17
+ expect(subject.calculate(edit)).to eq 1.0 / (1.0 + 3.0)
18
+ end
19
+
20
+ it 'returns 0.5 on both no terms in text revisions' do
21
+ text = Text.new('{{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: text)
24
+ new_rev = build(:new_revision, text: text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.5
28
+ end
29
+
30
+ it 'returns 0.0 on emtpy clean text of old revision' do
31
+ old_text = Text.new('{{speedy deletion}}')
32
+ new_text = Text.new('great')
33
+
34
+ old_rev = build(:old_revision, text: old_text)
35
+ new_rev = build(:new_revision, text: new_text)
36
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
37
+
38
+ expect(subject.calculate(edit)).to eq 0.0
39
+ end
40
+
41
+ it 'returns 1.0 on emtpy clean text of new revision' do
42
+ old_text = Text.new('great')
43
+ new_text = Text.new('{{speedy deletion}}')
44
+
45
+ old_rev = build(:old_revision, text: old_text)
46
+ new_rev = build(:new_revision, text: new_text)
47
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
48
+
49
+ expect(subject.calculate(edit)).to eq 1.0
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,35 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::Blanking do
4
+ let(:blank_text) { 'a' * (Features::Blanking::BLANKING_THRESHOLD - 1) }
5
+
6
+ it { is_expected.to be_a Features::Base }
7
+
8
+ describe '#calculate' do
9
+ it 'returns 1.0 in case of full blanking the new revision' do
10
+ # full blanking means size < BLANKING_THRESHOLD.
11
+ old_rev = build(:old_revision, text: "#{blank_text} additional text")
12
+ new_rev = build(:new_revision, text: blank_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 1.0
16
+ end
17
+
18
+ it 'returns 0.0 in case of not full blanking the new revision' do
19
+ # not full blanking means size >= BLANKING_THRESHOLD.
20
+ old_rev = build(:old_revision, text: "#{blank_text} additional text")
21
+ new_rev = build(:new_revision, text: "#{blank_text}a")
22
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
23
+
24
+ expect(subject.calculate(edit)).to eq 0.0
25
+ end
26
+
27
+ it 'returns 0.0 if old revision is <= new revision' do
28
+ old_rev = build(:old_revision, text: blank_text)
29
+ new_rev = build(:new_revision, text: blank_text.next!)
30
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
31
+
32
+ expect(subject.calculate(edit)).to eq 0.0
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CharacterDiversity do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the character diversity of the new inserted text' do
8
+ old_text = Text.new('text')
9
+ # 9 unique characters of total 14
10
+ new_text = Text.new('text [[aa ab cdeefg]]')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 14**(1.0 / 9)
17
+ end
18
+
19
+ it 'returns 0.0 if no text inserted' do
20
+ old_text = Text.new('deletion text')
21
+ new_text = Text.new('text')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,31 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CharacterSequence do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of the new revision’s longest character sequence' do
8
+ old_text = Text.new('a 666666')
9
+ new_text = Text.new("a 666666 4444ccc eefffff gggg g ''fffaffff''")
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 5
17
+ end
18
+
19
+ it 'returns 0 if no text was inserted' do
20
+ old_text = Text.new('a 666666 4444ccc eeeefffff gggg g')
21
+ new_text = Text.new('a 666666 ')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+
26
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
27
+
28
+ expect(subject.calculate(edit)).to eq 0
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentBadFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of bad words in comment over all words' do
8
+ # total 11 words, 6 bad words
9
+ comment = Text.new('666 was 666 if 666 was 666 and guy are 666')
10
+
11
+ old_rev = build(:old_revision)
12
+ new_rev = build(:new_revision, comment: comment)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 6.0 / 11.0
16
+ end
17
+
18
+ it 'returns 0.0 on emtpy clean text comment' do
19
+ comment = Text.new('{{speedy deletion}}')
20
+
21
+ old_rev = build(:old_revision)
22
+ new_rev = build(:new_revision, comment: comment)
23
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0.0
26
+ end
27
+ end
28
+ end