wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentBiasedFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of biased words in comment over all words' do
8
+ # total 9 words, 4 biased
9
+ comment = Text.new('It’s Great man, this is amazing, really a classic.')
10
+
11
+ old_rev = build(:old_revision)
12
+ new_rev = build(:new_revision, comment: comment)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 4.0 / 9.0
16
+ end
17
+
18
+ it 'returns 0.0 for an emtpy clean text comment in the new revision' do
19
+ comment = Text.new('{{speedy deletion}}')
20
+
21
+ old_rev = build(:old_revision)
22
+ new_rev = build(:new_revision, comment: comment)
23
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0.0
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentLength do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the length of the new revisions comment' do
8
+ comment = Text.new('1 34567 9')
9
+ edit = build :edit, new_revision: build(:new_revision, comment: comment)
10
+
11
+ expect(subject.calculate(edit)).to eq 9
12
+ end
13
+
14
+ it 'returns 0 on emtpy clean text' do
15
+ text = Text.new('{{speedy deletion}}')
16
+ edit = build :edit, new_revision: build(:new_revision, text: text)
17
+
18
+ expect(subject.calculate(edit)).to eq 0
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentMarkupFrequency do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of markup words in comment over all words' do
8
+ # total 7 words, 3 markup
9
+ comment = Text.new('[[Content]] is not always {{simple}} to [[produce]]')
10
+
11
+ old_rev = build(:old_revision)
12
+ new_rev = build(:new_revision, comment: comment)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 3.0 / 7.0
16
+ end
17
+
18
+ it 'returns 0.0 on emtpy text comment' do
19
+ comment = Text.new
20
+
21
+ old_rev = build(:old_revision)
22
+ new_rev = build(:new_revision, comment: comment)
23
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0.0
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentPronounFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of pronouns in comment over all words count' do
8
+ # total 12 words, 7 pronouns
9
+ comment = Text.new('I was you if You was Me and we are ourselves us')
10
+
11
+ old_rev = build(:old_revision)
12
+ new_rev = build(:new_revision, comment: comment)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 7.0 / 12.0
16
+ end
17
+
18
+ it 'returns 0.0 on emtpy clean text comment' do
19
+ comment = Text.new('{{speedy deletion}}')
20
+
21
+ old_rev = build(:old_revision)
22
+ new_rev = build(:new_revision, comment: comment)
23
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0.0
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentSexFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of sex words in comment over all words' do
8
+ # total 9 words, 5 sex words
9
+ comment = Text.new('Penis was penis if penis was penis and anal')
10
+
11
+ old_rev = build(:old_revision)
12
+ new_rev = build(:new_revision, comment: comment)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 5.0 / 9.0
16
+ end
17
+
18
+ it 'returns 0.0 on emtpy clean text comment' do
19
+ comment = Text.new('{{speedy deletion}}')
20
+
21
+ old_rev = build(:old_revision)
22
+ new_rev = build(:new_revision, comment: comment)
23
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0.0
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::CommentVulgarismFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of vulgarism words in comment over all words' do
8
+ # total 7 words, 2 vulgarism
9
+ comment = Text.new('Fuck you bitch. This is my change!')
10
+
11
+ old_rev = build(:old_revision)
12
+ new_rev = build(:new_revision, comment: comment)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 2.0 / 7.0
16
+ end
17
+
18
+ it 'returns 0.0 on emtpy clean text comment' do
19
+ comment = Text.new('{{speedy deletion}}')
20
+
21
+ old_rev = build(:old_revision)
22
+ new_rev = build(:new_revision, comment: comment)
23
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0.0
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+ require 'zlib'
3
+
4
+ describe Wikipedia::VandalismDetection::Features::Compressibility do
5
+ it { is_expected.to be_a Features::Base }
6
+
7
+ describe '#calculate' do
8
+ it 'returns the ratio of compressed text size to uncompressed text size' do
9
+ old_text = 'text'
10
+ new_text = 'text [[If this is a quite long textpart]] of normal words ' \
11
+ 'then it might be less possible to be a vandalism.'
12
+
13
+ old_rev = build(:old_revision, text: Text.new(old_text))
14
+ new_rev = build(:new_revision, text: Text.new(new_text))
15
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
16
+
17
+ bytesize = 10.0
18
+
19
+ allow(Zlib::Deflate).to receive(:deflate).and_return(Text.new)
20
+ allow_any_instance_of(Text).to receive(:bytesize).and_return(bytesize)
21
+
22
+ ratio = bytesize / (bytesize + bytesize)
23
+
24
+ expect(subject.calculate(edit)).to eq ratio
25
+ end
26
+
27
+ it 'returns 0.5 on emtpy inserted text' do
28
+ old_text = Text.new('deletion text')
29
+ new_text = Text.new(' text')
30
+
31
+ old_rev = build(:old_revision, text: old_text)
32
+ new_rev = build(:new_revision, text: new_text)
33
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
34
+
35
+ expect(subject.calculate(edit)).to eq 0.5
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::ContainsBase do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#contains' do
7
+ it 'returns 1 if a given text contains the given terms array' do
8
+ text = 'Content including text'
9
+ expect(subject.contains(text, %w[content anything])).to eq 1
10
+ end
11
+
12
+ it 'returns 1 if a given text contains the given string' do
13
+ text = 'Content including text'
14
+ expect(subject.contains(text, 'content')).to eq 1
15
+ end
16
+
17
+ it 'returns 0 if a given text does not contain the given string' do
18
+ text = 'not containing anything con tent'
19
+ expect(subject.contains(text, 'content')).to eq 0
20
+ end
21
+
22
+ it 'returns 0 if a given text does not contain any of the given terms' do
23
+ text = 'not containing anything con tent'
24
+ expect(subject.contains(text, %w[content text])).to eq 0
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::Copyedit do
4
+ it { is_expected.to be_a Features::ContainsBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns 1 if the edit comment includes "copyedit"' do
8
+ comment = Text.new('copyediting content')
9
+ new_rev = build(:new_revision, comment: comment)
10
+ edit = build(:edit, new_revision: new_rev)
11
+
12
+ expect(subject.calculate(edit)).to eq 1
13
+ end
14
+
15
+ it 'returns 1 if the edit comment includes "copy edit"' do
16
+ comment = Text.new('copy editing content')
17
+ new_rev = build(:new_revision, comment: comment)
18
+ edit = build(:edit, new_revision: new_rev)
19
+
20
+ expect(subject.calculate(edit)).to eq 1
21
+ end
22
+
23
+ it 'returns 0 for emtpy an comment in new revision' do
24
+ new_rev = build(:new_revision, comment: '')
25
+ edit = build :edit, new_revision: new_rev
26
+
27
+ expect(subject.calculate(edit)).to eq 0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::DigitRatio do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the digit to all letters ratio for the new inserted text' do
8
+ old_text = Text.new('text1')
9
+ # 3 digit letters of total 8 letters
10
+ new_text = Text.new('text1 [[1A4 B6 8Cd]]')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq((1.0 + 4) / (1.0 + 8))
17
+ end
18
+
19
+ it 'returns 0.0 if no text was inserted' do
20
+ old_text = Text.new('deletion text')
21
+ new_text = Text.new('text')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::EditsPerUser do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ describe 'online' do
8
+ it 'returns the number of previous edits from same IP or ID' do
9
+ # https://en.wikipedia.org/w/api.php?action=query&format=json&list=usercontribs&ucuser=<name or ip>&ucprop=ids
10
+ old_rev = build(:old_revision, id: '527136737')
11
+ new_rev = build(
12
+ :new_revision,
13
+ id: '527137015',
14
+ parent_id: '527136737',
15
+ contributor: '142.11.81.219',
16
+ timestamp: '2012-12-09T05:30:07Z'
17
+ )
18
+
19
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
20
+
21
+ expect(subject.calculate(edit)).to eq 1
22
+ end
23
+ end
24
+
25
+ describe 'offline' do
26
+ before do
27
+ page = build(:page, id: '1234', title: 'Page Title')
28
+
29
+ # contributor: see factories/page.rb !
30
+ old_rev = build(:new_revision, contributor: 'User')
31
+ new_rev = build(:even_newer_revision, contributor: 'User')
32
+
33
+ @edit = build(
34
+ :edit,
35
+ old_revision: old_rev,
36
+ new_revision: new_rev,
37
+ page: page
38
+ )
39
+ end
40
+
41
+ it 'does not use an API call if the edit has a page reference' do
42
+ expect(Wikipedia).to_not receive :api_request
43
+ subject.calculate(@edit)
44
+ end
45
+
46
+ it 'returns the number of previous edits from the same IP or ID' do
47
+ expect(subject.calculate(@edit)).to eq 1
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::EmoticonsFrequency do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of emoticons over all words' do
8
+ # total 8 words, 3 emoticons
9
+ old_text = Text.new('Old :-).')
10
+ new_text = Text.new('Old :-). ;) love icons and emoticons? :D :P, yeah.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 8.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy clean text revisions' do
20
+ old_text = Text.new('Old :-).')
21
+ new_text = Text.new('Old :-). {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::EmoticonsImpact do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of emoticon words of the new revision text' do
8
+ # 3 emoticons
9
+ old_text = ':) Hi you I got some :-X, you know ;)'
10
+
11
+ # 4 emoticons
12
+ new_text = ':) Hi (=you) I added another :-X you know ;)? (='
13
+
14
+ old_rev = build(:old_revision, text: old_text)
15
+ new_rev = build(:new_revision, text: new_text)
16
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
17
+
18
+ expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
19
+ end
20
+
21
+ it 'returns 0.5 if both text revisions have no terms' do
22
+ old_rev = build(:old_revision, text: '')
23
+ new_rev = build(:new_revision, text: '')
24
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0.5
27
+ end
28
+
29
+ it 'returns 0.0 for an emtpy text in the old revision' do
30
+ old_rev = build(:old_revision, text: '')
31
+ new_rev = build(:new_revision, text: ':)')
32
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
33
+
34
+ expect(subject.calculate(edit)).to eq 0.0
35
+ end
36
+
37
+ it 'returns 1.0 for an emtpy text in the new revision' do
38
+ old_rev = build(:old_revision, text: ':)')
39
+ new_rev = build(:new_revision, text: '')
40
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
41
+
42
+ expect(subject.calculate(edit)).to eq 1.0
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,20 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::FrequencyBase do
4
+ let(:terms) { Wikipedia::VandalismDetection::WordLists::PRONOUNS }
5
+
6
+ it { is_expected.to be_a Features::Base }
7
+
8
+ describe '#frequency' do
9
+ it { is_expected.to respond_to :frequency }
10
+
11
+ it 'returns the frequency in percentage of given word counts' do
12
+ text = 'I am, i like you.'
13
+ expect(subject.frequency(text, terms)).to eq 3.0 / 5.0
14
+ end
15
+
16
+ it 'returns 0.0 if total word count is zero' do
17
+ expect(subject.frequency('', terms)).to eq 0.0
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+ require 'wikipedia/vandalism_detection/features/impact_base'
3
+
4
+ describe Wikipedia::VandalismDetection::Features::ImpactBase do
5
+ let(:pronouns) { Wikipedia::VandalismDetection::WordLists::PRONOUNS }
6
+
7
+ it { is_expected.to be_a Features::Base }
8
+
9
+ describe '#impact' do
10
+ it { is_expected.to respond_to :impact }
11
+
12
+ it 'returns the impact in % of given terms in old relative to new text' do
13
+ # 3 pronouns
14
+ old_text = 'Your old text will be mine or Yours'
15
+ # 4 pronouns
16
+ new_text = 'My new text and your old text will be ours and mine'
17
+
18
+ expect(subject.impact(old_text, new_text, pronouns))
19
+ .to eq 3.0 / (3.0 + 4.0)
20
+ end
21
+
22
+ it 'returns 0.0 if old terms word count is zero' do
23
+ new_text = 'My new text and your old text will be ours and mine'
24
+ expect(subject.impact('', new_text, pronouns)).to eq 0.0
25
+ end
26
+
27
+ it 'returns 1.0 if new terms word count is zero' do
28
+ old_text = 'My new text and your old text will be ours and mine'
29
+ expect(subject.impact(old_text, '', pronouns)).to eq 1.0
30
+ end
31
+
32
+ it 'returns 0.5 if both terms word count is zero' do
33
+ expect(subject.impact('', '', pronouns)).to eq 0.5
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::InsertedCharacterDistribution do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the KL-Divergence of the inserted characters distribution' do
8
+ old_text = Text.new('old text')
9
+ new_text = Text.new('old text [[new inserted text]] given dero 9')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 1.6609633564650683
16
+ end
17
+
18
+ it 'returns missing value if no alphanumeric characters were inserted' do
19
+ old_text = Text.new('old text')
20
+ new_text = Text.new('old text !* [[?]]')
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
27
+ end
28
+
29
+ it 'returns missing value if no text was inserted' do
30
+ old_text = Text.new('deletion text')
31
+ new_text = Text.new('text')
32
+
33
+ old_rev = build(:old_revision, text: old_text)
34
+ new_rev = build(:new_revision, text: new_text)
35
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
36
+
37
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::InsertedExternalLinks do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of inserted external links' do
8
+ old_text = Text.new('123')
9
+ new_text = Text.new('123 [http://google.com Google] https://example.com')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 2
16
+ end
17
+
18
+ it 'returns 0 if no text was inserted' do
19
+ old_text = Text.new('123 456789')
20
+ new_text = Text.new('123') # 0 inserted
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::InsertedInternalLinks do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of inserted internal links' do
8
+ old_text = Text.new('123')
9
+ new_text = Text.new('123 [[link]] [[linkname|link]]')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 2
16
+ end
17
+
18
+ it 'returns 0 if no inserted text' do
19
+ old_text = Text.new('123 456789')
20
+ new_text = Text.new('123') # 0 inserted
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::InsertedSize do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the size of the text inserted in the new revision' do
8
+ old_text = Text.new('123')
9
+ new_text = Text.new('123 456789') # 6 inserted
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 6
16
+ end
17
+
18
+ it 'returns 0 if no inserted text' do
19
+ old_text = Text.new('123 456789')
20
+ new_text = Text.new('123') # 0 inserted
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::InsertedWords do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of the inserted words' do
8
+ old_text = Text.new('zero')
9
+ new_text = Text.new('zero one two three four five six')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 6
16
+ end
17
+
18
+ it 'returns 0 if no inserted text' do
19
+ old_text = Text.new('zero one')
20
+ new_text = Text.new('zero')
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::LongestWord do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the length of the longest word in the inserted text of the new revision' do
8
+ old_text = Text.new('1 7777777')
9
+ new_text = Text.new("1 7777777 22 a2c4e 333 55555\n======head======\nfff")
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 5
16
+ end
17
+
18
+ it 'returns 0 on non inserted clean text' do
19
+ old_text = Text.new('1 22')
20
+ new_text = Text.new('1 22 {{speedy deletion}}')
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end