wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::RevisionsCharacterDistribution do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the KL-Divergence of the inserted characters distribution' do
8
+ old_text = Text.new('old text old text is standing here')
9
+ new_text = Text.new('old text [[new inserted text]] given dero 9')
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 0.6312751553366259
16
+ end
17
+
18
+ it 'returns missing value if new revision text is empty' do
19
+ old_text = Text.new('old text')
20
+ new_text = Text.new
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
27
+ end
28
+
29
+ it 'returns missing value if the old revision text is empty' do
30
+ old_text = Text.new
31
+ new_text = Text.new('new text')
32
+
33
+ old_rev = build(:old_revision, text: old_text)
34
+ new_rev = build(:new_revision, text: new_text)
35
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
36
+
37
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,71 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::SameEditor do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ context 'when both contributors are given' do
8
+ it 'returns 1.0 in case of the same previous editor' do
9
+ editor = 'Peter'
10
+ old_rev = build(:old_revision, contributor: editor)
11
+ new_rev = build(:new_revision, contributor: editor)
12
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
13
+
14
+ expect(subject.calculate(edit)).to eq 1
15
+ end
16
+
17
+ it 'returns 0.0 in case of another previous editor' do
18
+ old_rev = build(:old_revision, contributor: '137.163.16.199')
19
+ new_rev = build(:new_revision, contributor: 'Peter')
20
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
21
+
22
+ expect(subject.calculate(edit)).to eq 0
23
+ end
24
+ end
25
+
26
+ context 'when previous contributor is not present' do
27
+ context 'in case of the same previous editor' do
28
+ it 'requests the user from Wikipedia API and returns 1' do
29
+ # contributor: TOmaxer
30
+ old_rev = build(:old_revision, id: '324557983', contributor: nil)
31
+ new_rev = build(
32
+ :new_revision,
33
+ id: '329962649',
34
+ parent_id: '324557983',
35
+ contributor: 'Tomaxer'
36
+ )
37
+
38
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
39
+
40
+ expect(subject.calculate(edit)).to eq 1
41
+ end
42
+ end
43
+
44
+ context 'in case of another previous editor' do
45
+ it 'requests the user from Wikipedia API and returns 0' do
46
+ # 137.163.16.199
47
+ old_rev = build(:old_revision, id: '328774110', contributor: nil)
48
+ # ClueBot
49
+ new_rev = build(:new_revision, id: '328774035', parent_id: '328774110')
50
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
51
+
52
+ expect(subject.calculate(edit)).to eq 0
53
+ end
54
+ end
55
+
56
+ context 'if old reivision is not available anymore' do
57
+ it 'returns missing' do
58
+ # to get api call, see:
59
+ # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=timestamp&revids=325218985
60
+ # <rev revid="325218985"/>
61
+
62
+ old_rev = build(:old_revision, id: '325218985', contributor: nil)
63
+ new_rev = build(:new_revision, id: '326980599', parent_id: '325218985')
64
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
65
+
66
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::SexFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of vulgarism words over all words' do
8
+ # 6 inserted words in total, 3 of them sex words
9
+ old_text = Text.new('Old whatever.')
10
+ new_text = Text.new('Old whatever. New sex contents. Penis, dildos, boy.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 6.0
17
+ end
18
+
19
+ it 'returns 0.0 for an emtpy clean text in the new revision' do
20
+ old_text = Text.new('Old guy.')
21
+ new_text = Text.new('Old guy. {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,53 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::SexImpact do
4
+ it { is_expected.to be_a Features::ImpactBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of sex words of the new revision text' do
8
+ # 3 sex words
9
+ old_text = Text.new('Penis, old text dildo, breast it')
10
+
11
+ # 4 sex words
12
+ new_text = Text.new('Penis, old text dildo, breast anal it')
13
+
14
+ old_rev = build(:old_revision, text: old_text)
15
+ new_rev = build(:new_revision, text: new_text)
16
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
17
+
18
+ expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
19
+ end
20
+
21
+ it 'returns 0.5 if both revision text have no terms' do
22
+ text = Text.new('{{speedy deletion}}')
23
+
24
+ old_rev = build(:old_revision, text: text)
25
+ new_rev = build(:new_revision, text: text)
26
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
27
+
28
+ expect(subject.calculate(edit)).to eq 0.5
29
+ end
30
+
31
+ it 'returns 0.0 for an emtpy clean text in the old revision' do
32
+ old_text = Text.new('{{speedy deletion}}')
33
+ new_text = Text.new('anal')
34
+
35
+ old_rev = build(:old_revision, text: old_text)
36
+ new_rev = build(:new_revision, text: new_text)
37
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
38
+
39
+ expect(subject.calculate(edit)).to eq 0.0
40
+ end
41
+
42
+ it 'returns 1.0 for an emtpy clean text in the new revision' do
43
+ old_text = Text.new('anal')
44
+ new_text = Text.new('{{speedy deletion}}')
45
+
46
+ old_rev = build(:old_revision, text: old_text)
47
+ new_rev = build(:new_revision, text: new_text)
48
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
49
+
50
+ expect(subject.calculate(edit)).to eq 1.0
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::SizeIncrement do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns a negative increment on more removed texts' do
8
+ old_rev_text = Text.new('123456789') # length 9
9
+ new_rev_text = Text.new('123') # length 3
10
+
11
+ old_rev = build(:old_revision, text: old_rev_text)
12
+ new_rev = build(:new_revision, text: new_rev_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 3.0 - 9.0
16
+ end
17
+
18
+ it 'returns a positive increment on more removed texts' do
19
+ old_rev_text = Text.new('123') # length 3
20
+ new_rev_text = Text.new('123456789') # length 9
21
+
22
+ old_rev = build(:old_revision, text: old_rev_text)
23
+ new_rev = build(:new_revision, text: new_rev_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 9.0 - 3.0
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,48 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::SizeRatio do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the ratio of the revisions text sizes' do
8
+ old_text = Text.new('123456789') # length 9
9
+ new_text = Text.new('123') # length 3
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 9.0 / (9.0 + 3.0)
16
+ end
17
+
18
+ it 'returns 1.0 for an emtpy text in the new revision' do
19
+ old_text = Text.new('sample text')
20
+ new_text = Text.new
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 1.0
27
+ end
28
+
29
+ it 'returns 0.0 for an emtpy text in the old revisions' do
30
+ old_text = Text.new
31
+ new_text = Text.new('sample text')
32
+
33
+ old_rev = build(:old_revision, text: old_text)
34
+ new_rev = build(:new_revision, text: new_text)
35
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
36
+
37
+ expect(subject.calculate(edit)).to eq 0.0
38
+ end
39
+
40
+ it 'returns 0.5 if both revision texts are empty' do
41
+ old_rev = build(:old_revision, text: Text.new)
42
+ new_rev = build(:new_revision, text: Text.new)
43
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
44
+
45
+ expect(subject.calculate(edit)).to eq 0.5
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,32 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::TermFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the average relative frequency of inserted words' do
8
+ # removed [you, I], added [you, you, you, we]
9
+ # for [you, we] compute frequency in new_text and average for all words
10
+ # here: [you] is 6x in new text, [we] is 3x in new text of total 10 words
11
+ # avg = (6/10 + 3/10)/2
12
+ old_text = Text.new("we\nwe\nyou\nyou\nyou\nI\nyou\nI\n")
13
+ new_text = Text.new("we\nwe\nyou\nyou\nI\nyou\n''(you''\nyou\nyou\n[[we]])\n")
14
+
15
+ old_rev = build(:old_revision, text: old_text)
16
+ new_rev = build(:new_revision, text: new_text)
17
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
18
+
19
+ expect(subject.calculate(edit)).to eq(((6.0 / 10.0) + (3.0 / 10.0)) / 2.0)
20
+ end
21
+
22
+ it 'returns 0.0 on emtpy clean text revisions' do
23
+ text = Text.new('{{speedy deletion}}')
24
+
25
+ old_rev = build(:old_revision, text: text)
26
+ new_rev = build(:new_revision, text: text)
27
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
28
+
29
+ expect(subject.calculate(edit)).to eq 0.0
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,56 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::TimeInterval do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns time interval in days from the old to the new revision' do
8
+ old_timestamp = '2014-11-27T18:00:00Z'
9
+ new_timestamp = '2014-11-29T06:00:00Z'
10
+
11
+ old_rev = build(:old_revision, timestamp: old_timestamp)
12
+ new_rev = build(:new_revision, timestamp: new_timestamp)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 1.5
16
+ end
17
+
18
+ it 'requests the time from API if no old revisions timestamp is given' do
19
+ # to get api call, see:
20
+ # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=timestamp&revids=331655535
21
+ # => 2001-11-29T03:27:34Z
22
+ new_timestamp = '2001-11-06T13:16:13Z'
23
+
24
+ old_rev = build(:old_revision, id: '331655534', timestamp: nil)
25
+ new_rev = build(
26
+ :new_revision,
27
+ id: '331655535',
28
+ parent_id: '331655534',
29
+ timestamp: new_timestamp
30
+ )
31
+
32
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
33
+
34
+ expect(subject.calculate(edit)).to eq 0.5
35
+ end
36
+
37
+ it 'returns missing if the old reivision is not available anymore' do
38
+ # to get api call, see:
39
+ # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=timestamp&revids=325218985
40
+ # <rev revid="325218985"/>
41
+ new_timestamp = '2011-11-11T01:00:00Z'
42
+
43
+ old_rev = build(:old_revision, id: '325218985', timestamp: nil)
44
+ new_rev = build(
45
+ :new_revision,
46
+ id: '326980599',
47
+ parent_id: '325218985',
48
+ timestamp: new_timestamp
49
+ )
50
+
51
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
52
+
53
+ expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,16 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::TimeOfDay do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the time of day as decimal value (hours)' do
8
+ old_rev = build(:old_revision)
9
+ new_rev = build(:new_revision, timestamp: '2012-12-09T05:30:36Z')
10
+
11
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
12
+
13
+ expect(subject.calculate(edit)).to eq 5.6
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::UpperCaseRatio do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the uppercase to all letters ratio of inserted clean text' do
8
+ old_text = Text.new('text')
9
+ # 3 uppercase letters of total 4 inserted letters
10
+ new_text = Text.new('text [[1A 4B6 8Cd]]')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq((1.0 + 3) / (1.0 + 4))
17
+ end
18
+
19
+ it 'returns 0.0 if no text was inserted' do
20
+ old_text = Text.new('deletion text')
21
+ new_text = Text.new('text')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,33 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Wikipedia::VandalismDetection::Features::UpperCaseWordsRatio do
6
+ it { is_expected.to be_a Features::Base }
7
+
8
+ describe '#calculate' do
9
+ it 'returns the uppercase/all words ratio of the inserted cleaned text' do
10
+ old_text = Text.new('text')
11
+ # 2 uppercase words (numbers do not count) out of 4 inserted words in total.
12
+ # The template {{23A}} is removed while cleaning.
13
+ new_text = Text.new('text [[HELLO you]] NICE boyß3 1990 {{23A}}')
14
+
15
+ old_rev = build(:old_revision, text: old_text)
16
+ new_rev = build(:new_revision, text: new_text)
17
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
18
+
19
+ expect(subject.calculate(edit)).to eq((1.0 + 2) / (1.0 + 4))
20
+ end
21
+
22
+ it 'returns 0.0 if no text inserted' do
23
+ old_text = Text.new('DELECTION text')
24
+ new_text = Text.new('text')
25
+
26
+ old_rev = build(:old_revision, text: old_text)
27
+ new_rev = build(:new_revision, text: new_text)
28
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
29
+
30
+ expect(subject.calculate(edit)).to eq 0.0
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::UpperToLowerCaseRatio do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the upper to lower letters ratio of the inserted text' do
8
+ old_text = Text.new('text')
9
+ # 3 uppercase letters, 4 lowercase letters
10
+ new_text = Text.new('text [[1aA 4B6 8Cd ef]]')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq((1.0 + 3) / (1.0 + 4))
17
+ end
18
+
19
+ it 'returns 0.0 if no text inserted' do
20
+ old_text = Text.new('deletion text')
21
+ new_text = Text.new('text')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::VulgarismFrequency do
4
+ it { is_expected.to be_a Features::FrequencyBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the number of vulgarism words over all inserted words' do
8
+ # total 8 inserted words, 3 vulgarism
9
+ old_text = Text.new('Old shit.')
10
+ new_text = Text.new('Old shit. Fuck, fu*ck you $lut, and all the others.')
11
+
12
+ old_rev = build(:old_revision, text: old_text)
13
+ new_rev = build(:new_revision, text: new_text)
14
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
15
+
16
+ expect(subject.calculate(edit)).to eq 3.0 / 8.0
17
+ end
18
+
19
+ it 'returns 0.0 on empty clean text revisions' do
20
+ old_text = Text.new('Old shit.')
21
+ new_text = Text.new('Old shit. {{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: old_text)
24
+ new_rev = build(:new_revision, text: new_text)
25
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.0
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,52 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::VulgarismImpact do
4
+ it { is_expected.to be_a Features::ImpactBase }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the impact of vulgarism words of the new revision text' do
8
+ # 1 vulgarism
9
+ old_text = Text.new('Fuck you, you and all the others')
10
+ # 3 vulgarism
11
+ new_text = Text.new('Fuck you, fuck you, you and all the others sluts')
12
+
13
+ old_rev = build(:old_revision, text: old_text)
14
+ new_rev = build(:new_revision, text: new_text)
15
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
16
+
17
+ expect(subject.calculate(edit)).to eq 1.0 / (1.0 + 3.0)
18
+ end
19
+
20
+ it 'returns 0.5 when both revision texts have no terms' do
21
+ text = Text.new('{{speedy deletion}}')
22
+
23
+ old_rev = build(:old_revision, text: text)
24
+ new_rev = build(:new_revision, text: text)
25
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
26
+
27
+ expect(subject.calculate(edit)).to eq 0.5
28
+ end
29
+
30
+ it 'returns 0.0 for an empty clean text of the old revision' do
31
+ old_text = Text.new('{{speedy deletion}}')
32
+ new_text = Text.new('fuck')
33
+
34
+ old_rev = build(:old_revision, text: old_text)
35
+ new_rev = build(:new_revision, text: new_text)
36
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
37
+
38
+ expect(subject.calculate(edit)).to eq 0.0
39
+ end
40
+
41
+ it 'returns 1.0 for an empty clean text of the new revision' do
42
+ old_text = Text.new('fuck')
43
+ new_text = Text.new('{{speedy deletion}}')
44
+
45
+ old_rev = build(:old_revision, text: old_text)
46
+ new_rev = build(:new_revision, text: new_text)
47
+ edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
48
+
49
+ expect(subject.calculate(edit)).to eq 1.0
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,16 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::Weekday do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns the weekday as decimal value' do
8
+ old_rev = build(:old_revision)
9
+ new_rev = build(:new_revision, timestamp: '2012-12-11T05:30:36Z')
10
+
11
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
12
+
13
+ expect(subject.calculate(edit)).to eq 2 # Tuesday
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Features::WordsIncrement do
4
+ it { is_expected.to be_a Features::Base }
5
+
6
+ describe '#calculate' do
7
+ it 'returns a negative increment for more removed texts' do
8
+ old_text = Text.new('one two three four five six seven eight') # length 8
9
+ new_text = Text.new('one two three') # length 3
10
+
11
+ old_rev = build(:old_revision, text: old_text)
12
+ new_rev = build(:new_revision, text: new_text)
13
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
14
+
15
+ expect(subject.calculate(edit)).to eq 3.0 - 8.0
16
+ end
17
+
18
+ it 'returns a positive increment for more inserted texts' do
19
+ old_text = Text.new('one two three') # length 3
20
+ new_text = Text.new('one two three four five six seven eight') # length 8
21
+
22
+ old_rev = build(:old_revision, text: old_text)
23
+ new_rev = build(:new_revision, text: new_text)
24
+ edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
25
+
26
+ expect(subject.calculate(edit)).to eq 8.0 - 3.0
27
+ end
28
+ end
29
+ end