wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,31 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::MarkupFrequency do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of markup related words over all inserted words' do
8
+ # total 4 words, 3 markup
9
+ old_text = Text.new('Old whatever.')
10
+ new_text = Text.new('Old whatever. {{template}} <ref>list</ref> [[heading]] boy.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 4.0
17
+ end
18
+
19
+ it 'returns 0.0 on no inserted text' do
20
+ text = 'Old guy.'
21
+ old_text = Text.new(text)
22
+ new_text = Text.new(text)
23
+
24
+ old_rev = build(:old_revision, text: old_text)
25
+ new_rev = build(:new_revision, text: new_text)
26
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
27
+
28
+ expect(subject.calculate(edit)).to eq 0.0
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,53 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::MarkupImpact do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of markup words of the edit’s new revision text' do
8
+ # 3 markup words
9
+ old_text = '{{template}} <ref>reference</ref> [[hello]] hello'
10
+
11
+ # 4 markup words
12
+ new_text = '{{template}} <ref>reference</ref> [[hello]] cite dude'
13
+
14
+ old_rev = build(:old_revision, text: old_text)
15
+ new_rev = build(:new_revision, text: new_text)
16
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
17
+
18
+ expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
19
+ end
20
+
21
+ it 'returns 0.5 on both no terms in text revisions' do
22
+ text = ''
23
+
24
+ old_rev = build(:old_revision, text: text)
25
+ new_rev = build(:new_revision, text: text)
26
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
27
+
28
+ expect(subject.calculate(edit)).to eq 0.5
29
+ end
30
+
31
+ it 'returns 0.0 on emtpy text of old revision' do
32
+ old_text = ''
33
+ new_text = '{{template}}'
34
+
35
+ old_rev = build(:old_revision, text: old_text)
36
+ new_rev = build(:new_revision, text: new_text)
37
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
38
+
39
+ expect(subject.calculate(edit)).to eq 0.0
40
+ end
41
+
42
+ it 'returns 1.0 on emtpy text of new revision' do
43
+ old_text = '{{template}}'
44
+ new_text = ''
45
+
46
+ old_rev = build(:old_revision, text: old_text)
47
+ new_rev = build(:new_revision, text: new_text)
48
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
49
+
50
+ expect(subject.calculate(edit)).to eq 1.0
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::NonAlphanumericRatio do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the non-alphanum to all letters ratio of the inserted text' do
8
+ old_text = Text.new('t$xt')
9
+ # 7 non-alphanumeric characters of total 15 inserted characters
10
+ new_text = Text.new('t$xt [[1A$% 4B6]] 8Cd?')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq((1.0 + 7) / (1.0 + 15))
17
+ end
18
+
19
+ it 'returns 0.0 if no text was inserted' do
20
+ old_text = Text.new('deletion text')
21
+ new_text = Text.new('text')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,22 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::PersonalLife do
4
+ it { is_expected.to be_a Features::ContainsBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns 1 if the edit comment includes "personal life"' do
8
+ comment = Text.new('/* Personal life */ edited')
9
+ new_rev = build(:new_revision, comment: comment)
10
+ edit = build(:edit, new_revision: new_rev)
11
+
12
+ expect(subject.calculate(edit)).to eq 1
13
+ end
14
+
15
+ it 'returns 0 on emtpy comment' do
16
+ new_rev = build(:new_revision, comment: '')
17
+ edit = build(:edit, new_revision: new_rev)
18
+
19
+ expect(subject.calculate(edit)).to eq 0
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::PronounFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of pronouns relative to all words count' do
8
+ # total 10 words, 6 pronouns
9
+ old_text = Text.new('Your old.')
10
+ new_text = Text.new('Your old. I was you if You was we are ourselves us.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 6.0 / 10.0
17
+ end
18
+
19
+ it 'returns 0.0 for an emtpy clean text in the new revision' do
20
+ old_text = Text.new('Your old.')
21
+ new_text = Text.new('Your old. {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,53 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::PronounImpact do
4
+ it { is_expected.to be_a Features::ImpactBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of pronouns of the new revision text' do
8
+ # 3 pronouns
9
+ old_text = Text.new('Your old text will be mine or Your’s')
10
+
11
+ # 4 pronouns
12
+ new_text = Text.new('My new text and your old text will be ours and mine')
13
+
14
+ old_rev = build(:old_revision, text: old_text)
15
+ new_rev = build(:new_revision, text: new_text)
16
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
17
+
18
+ expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
19
+ end
20
+
21
+ it 'returns 0.5 if both text revisions include no terms' do
22
+ text = Text.new('{{speedy deletion}}')
23
+
24
+ old_rev = build(:old_revision, text: text)
25
+ new_rev = build(:new_revision, text: text)
26
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
27
+
28
+ expect(subject.calculate(edit)).to eq 0.5
29
+ end
30
+
31
+ it 'returns 0.0 for emtpy clean text of old revision' do
32
+ old_text = Text.new('{{speedy deletion}}')
33
+ new_text = Text.new('You')
34
+
35
+ old_rev = build(:old_revision, text: old_text)
36
+ new_rev = build(:new_revision, text: new_text)
37
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
38
+
39
+ expect(subject.calculate(edit)).to eq 0.0
40
+ end
41
+
42
+ it 'returns 1.0 for emtpy clean text of new revision' do
43
+ old_text = Text.new('You')
44
+ new_text = Text.new('{{speedy deletion}}')
45
+
46
+ old_rev = build(:old_revision, text: old_text)
47
+ new_rev = build(:new_revision, text: new_text)
48
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
49
+
50
+ expect(subject.calculate(edit)).to eq 1.0
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedAllWordlistsFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed lists words over all removed words' do
8
+ # removed: total 7 words, 1 vulgarism, 1 biased, 2 pronouns = 4 bad
9
+ old_text = Text.new('Your old shit. Fuck you great, you and the others.')
10
+ new_text = Text.new('Your old shit.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 4.0 / 7.0
17
+ end
18
+
19
+ it 'returns 0.0 for an empty removed clean text' do
20
+ old_text = Text.new('Your old shit. {{speedy deletion}}')
21
+ new_text = Text.new('Your old shit.')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedBadFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed bad words over all removed words' do
8
+ # removed: total 9 words, 4 bad
9
+ old_text = Text.new('666 old. It’s 666 man, this is 666, 666 a whatever.')
10
+ new_text = Text.new('666 old.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 4.0 / 9.0
17
+ end
18
+
19
+ it 'returns 0.0 for an emtpy removed clean text' do
20
+ old_text = Text.new('whatever old. {{speedy deletion}}')
21
+ new_text = Text.new('whatever old. whatever new.')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedBiasedFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed biased words over removed words count' do
8
+ # removed: total 7 words, 3 biased (great, really, classic)
9
+ old_text = Text.new('Great old. This is so great, really a classic.')
10
+ new_text = Text.new('Great old.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 7.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy removed clean text' do
20
+ old_text = Text.new('Great old. {{speedy deletion}}')
21
+ new_text = Text.new('Great old. Great new.')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedCharacterDistribution do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the KL-Divergence of the removed characters distribution' do
8
+ old_text = Text.new('old text [[new inserted text]] given dero 9')
9
+ new_text = Text.new('old text')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 1.6609633564650683
16
+ end
17
+
18
+ it 'returns a missing value if no alphanumeric characters were removed' do
19
+ old_text = Text.new('old text !* [[?]]')
20
+ new_text = Text.new('old text')
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
27
+ end
28
+
29
+ it 'returns a missing value if no text was inserted' do
30
+ old_text = Text.new('text')
31
+ new_text = Text.new('deletion text')
32
+
33
+ old_rev = build(:old_revision, text: old_text)
34
+ new_rev = build(:new_revision, text: new_text)
35
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
36
+
37
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedEmoticonsFrequency do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed emoticon words over all removed words' do
8
+ # removed: total 6 words, 2 emoticons
9
+ old_text = Text.new(':) old. It’s :P man:Pio, this is X-D.')
10
+ new_text = Text.new(':) old.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 2.0 / 6.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy removed text' do
20
+ old_text = Text.new('Great old. {{speedy deletion}}')
21
+ new_text = Text.new('Great old. {{speedy deletion}} :)')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedMarkupFrequency do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed markup words over all removed words' do
8
+ # removed: total 5 words, 2 markup
9
+ old_text = Text.new('[[Great]] old. It is [[Great]] man, [[amazing]].')
10
+ new_text = Text.new('[[Great]] old.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 2.0 / 5.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy removed text' do
20
+ old_text = Text.new('Great old. {{speedy deletion}}')
21
+ new_text = Text.new('Great old. {{speedy deletion}} [[heading]]')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedPronounFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed pronouns over all removed words' do
8
+ # removed: total 10 words, 6 pronouns
9
+ old_text = Text.new('Your old. I was you if You was We are ourselves us.')
10
+ new_text = Text.new('Your old.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 6.0 / 10.0
17
+ end
18
+
19
+ it 'returns 0.0 for an emtpy removed clean text in the new revision' do
20
+ old_text = Text.new('Your old. {{speedy deletion}}')
21
+ new_text = Text.new('Your old. My inserted.')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedSexFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed sex words over all removed words' do
8
+ # removed: total 7 words, 3 sex words
9
+ old_text = Text.new('Penis old. It’s Penis man, this is penis, anal.')
10
+ new_text = Text.new('Penis old.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 7.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy removed clean text' do
20
+ old_text = Text.new('penis old. {{speedy deletion}}')
21
+ new_text = Text.new('penis old. Penis new.')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedSize do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the size of the new removed text' do
8
+ old_text = Text.new('123 456789')
9
+ new_text = Text.new('123') # 6 removed
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 6
16
+ end
17
+
18
+ it 'returns 0 if no removed text ' do
19
+ old_text = Text.new('123')
20
+ new_text = Text.new('123 456789') # 0 removed
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedVulgarismFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of removed vulgarism over all removed words' do
8
+ # removed: total 8 words, 3 vulgarisms
9
+ old_text = Text.new('Old shit. Fuck, fu*ck you $lut, and all the others.')
10
+ new_text = Text.new('Old shit. New shit.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 8.0
17
+ end
18
+
19
+ it 'returns 0.0 on emtpy removed clean text revisions' do
20
+ old_text = Text.new('Old shit. {{speedy deletion}}')
21
+ new_text = Text.new('Old shit. New shit.')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RemovedWords do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of the edit’s removed words' do
8
+ old_text = Text.new('zero one two three four five six') # 6 removed
9
+ new_text = Text.new('zero')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 6
16
+ end
17
+
18
+ it 'returns 0 if no text was removed' do
19
+ old_text = Text.new('zero') # 0 removed
20
+ new_text = Text.new('zero one')
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,37 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::ReplacementSimilarity do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the similarity of the deleted text to inserted in exchange' do
8
+ old_text = Text.new('this is Mr. Dixon')
9
+ new_text = Text.new('this is Mr. Dicksonx')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 0.8133333333333332
17
+ end
18
+
19
+ it 'returns 0 if the old revision text is empty' do
20
+ old_rev = build(:old_revision, text: '')
21
+ new_rev = build(:new_revision, text: '{{speedy deletion}}')
22
+
23
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
24
+
25
+ expect(subject.calculate(edit)).to eq 0
26
+ end
27
+
28
+ it 'returns 0 if the new revision text is empty' do
29
+ old_rev = build(:old_revision, text: '{{speedy deletion}}')
30
+ new_rev = build(:new_revision, text: '')
31
+
32
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
33
+
34
+ expect(subject.calculate(edit)).to eq 0
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,24 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::Reverted do
4
+ it { is_expected.to be_a Features::ContainsBase }
5
+
6
+ describe '#calculate' do
7
+ %w[rvt rvv revert].each do |term|
8
+ it "returns 1 if the edit comment includes '#{term}'" do
9
+ comment = Text.new("#{term} edited")
10
+ new_rev = build(:new_revision, comment: comment)
11
+ edit = build(:edit, new_revision: new_rev)
12
+
13
+ expect(subject.calculate(edit)).to eq 1
14
+ end
15
+ end
16
+
17
+ it 'returns 0 for an emtpy comment' do
18
+ new_rev = build(:new_revision, comment: '')
19
+ edit = build(:edit, new_revision: new_rev)
20
+
21
+ expect(subject.calculate(edit)).to eq 0
22
+ end
23
+ end
24
+ end