wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature relates the number of digits to the number of alphanumeric
# characters in the text inserted by the edit's new revision.
class DigitRatio < Base
  # Returns 0.0 when nothing was inserted. Otherwise returns
  # (1 + digits) / (1 + alphanumerics) as a Float; the added ones keep the
  # quotient defined when the text has no alphanumeric characters.
  def calculate(edit)
    super

    inserted = edit.inserted_text

    if inserted.empty?
      0.0
    else
      digits = inserted.scan(/[[:digit:]]/).size
      alphanumerics = inserted.scan(/[[:alnum:]]/).size

      (digits + 1.0) / (alphanumerics + 1.0)
    end
  end
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,72 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'date'
5
+
6
+ module Wikipedia
7
+ module VandalismDetection
8
+ module Features
9
# This feature counts the edits that the edit's editor (IP or ID) submitted
# to the same article before the given edit.
class EditsPerUser < Base
  # Returns the number of earlier edits by the edit's editor on the same
  # article. If the edit has a page with an id, the page's edits are
  # inspected; otherwise the number is derived from an API request.
  # Attention: the API path is pretty time consuming (~2sec) due to the
  # url request.
  def calculate(edit)
    super

    new_revision = edit.new_revision
    page = edit.page

    return edits_count_from_api_request(new_revision) unless page && page.id

    edits_count_from_page(edit)
  end

  protected

  # Counts the page's edits whose new revision has the same contributor as
  # the given edit's new revision and a strictly earlier timestamp.
  def edits_count_from_page(edit)
    current = edit.new_revision

    edit.page.edits.inject(0) do |total, page_edit|
      candidate = page_edit.new_revision

      by_same_user = candidate.contributor == current.contributor
      seconds_apart = time_diff(candidate.timestamp, current.timestamp)

      (by_same_user && seconds_apart < 0) ? total + 1 : total
    end
  end

  # Requests the contributor's 'usercontribs' list, looks up the item that
  # matches the revision id to learn its page id, and counts the items of
  # that page with a timestamp strictly before the revision's.
  # Returns 0 if the revision is not part of the response.
  def edits_count_from_api_request(revision)
    query = {
      list: 'usercontribs',
      ucuser: revision.contributor,
      ucprop: 'ids|timestamp'
    }

    response = Wikipedia.api_request(query)

    current_item = response.xpath("//item[@revid='#{revision.id}']").first
    return 0 unless current_item

    page_id = current_item.xpath('@pageid').text

    # count only edits before current
    response.xpath("//item[@pageid='#{page_id}']").inject(0) do |total, item|
      stamp = item.attr('timestamp')
      time_diff(stamp, revision.timestamp) < 0 ? total + 1 : total
    end
  end

  # Returns time1 - time2 in seconds, truncated to an Integer. Both
  # arguments are parsed with DateTime.parse; the result is negative when
  # time1 lies before time2.
  def time_diff(time1, time2)
    elapsed_days = DateTime.parse(time1) - DateTime.parse(time2)
    (elapsed_days * 24 * 60 * 60).to_i
  end
end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,27 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/emoticons'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
# This feature computes how frequently emoticons occur among the
# whitespace-separated tokens of the inserted text.
class EmoticonsFrequency < Base
  EMOJI_REGEX = /(^|\s)(#{WordLists::EMOTICONS.join('|')})(?=\s|$|\Z|[\.,!?]\s|[\.!?]\Z)/

  # Returns the number of emoticon matches divided by the number of
  # whitespace-separated tokens in the inserted text, as a Float.
  # Returns 0.0 if the inserted text has no tokens.
  def calculate(edit)
    super

    inserted = edit.inserted_text

    # Each match yields two captures: the leading boundary (empty or a single
    # whitespace character) and the emoticon. Keeping only captures of at
    # least two characters drops the boundary captures.
    captures = inserted.scan(EMOJI_REGEX).flatten
    emoticon_hits = captures.select { |capture| capture.size >= 2 }.size

    token_total = inserted.split.count
    return 0.0 unless token_total > 0

    emoticon_hits.to_f / token_total.to_f
  end
end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,29 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/emoticons'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
# This feature relates the number of emoticons in the old revision's text
# to the number of emoticons in the old and new revision's text together.
class EmoticonsImpact < Base
  EMOJI_REGEX = /(^|\s)(#{WordLists::EMOTICONS.join('|')})(?=\s|$|\Z|[\.,!?]\s|[\.!?]\Z)/

  # Returns old_count / (old_count + new_count) as a Float, or 0.5 if
  # neither revision text contains an emoticon.
  def calculate(edit)
    super

    old_text = edit.old_revision.text
    new_text = edit.new_revision.text

    before = emoticon_count(old_text)
    after = emoticon_count(new_text)

    return 0.5 if before.zero? && after.zero?

    before / (before + after)
  end

  private

  # Counts the captures of EMOJI_REGEX in the text that are at least two
  # characters long (this skips the leading boundary captures) and returns
  # the number as a Float.
  def emoticon_count(text)
    text.scan(EMOJI_REGEX).flatten.count { |capture| capture.size >= 2 }.to_f
  end
end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,18 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# Shared parent of the features that compute a term frequency for a text.
class FrequencyBase < Base
  # Returns the number of term occurrences (as reported by `count`) divided
  # by the number of whitespace-separated tokens of the text, as a Float.
  # Returns 0.0 if the text has no tokens.
  def frequency(text, terms)
    token_total = text.split.count
    term_hits = count(terms, in: text)

    return 0.0 unless token_total > 0

    term_hits.to_f / token_total.to_f
  end
end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,25 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# Shared parent of the features that compare term counts of the old and the
# new text.
class ImpactBase < Base
  # Returns old_count / (old_count + new_count) as a Float, where both
  # counts are the term occurrences reported by `count`.
  # Returns 0.5 if the terms occur in neither the old nor the new text.
  def impact(old_text, new_text, terms)
    old_hits = count(terms, in: old_text).to_f
    new_hits = count(terms, in: new_text).to_f

    return 0.5 if old_hits.zero? && new_hits.zero?

    old_hits / (old_hits + new_hits)
  end
end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/algorithms'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
# This feature computes the Kullback-Leibler Divergence of the inserted
# text's character distribution relative to the character distribution of
# the old revision's text.
# The smaller the divergence, the more similar the two distributions are,
# and conversely.
class InsertedCharacterDistribution < Base
  include Algorithms

  # Passes the old revision's text and the inserted text, in this order, to
  # kullback_leibler_divergence and returns its result.
  def calculate(edit)
    super

    reference_text = edit.old_revision.text
    inserted = edit.inserted_text

    kullback_leibler_divergence(reference_text, inserted)
  end
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,19 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature counts the external links, e.g. [http://example.com], in the
# inserted text.
class InsertedExternalLinks < Base
  # Matches http, https and ftp URLs, case-insensitively, with optional
  # surrounding square brackets.
  URL_REGEX = %r{\[?(https?|ftp)\s?:\s?\/\/[^\s\/$.?#].[^\s]*]?}i

  # Returns the number of URL_REGEX matches in the inserted text.
  def calculate(edit)
    super

    link_matches = edit.inserted_text.scan(URL_REGEX)
    link_matches.count
  end
end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,18 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature counts the internal links of format [[link]] in the inserted
# text.
class InsertedInternalLinks < Base
  INTERNAL_LINK_REGEX = /\[{2}([^\[].*?)\]{2}/

  # Returns the number of INTERNAL_LINK_REGEX matches in the inserted text.
  def calculate(edit)
    super

    link_matches = edit.inserted_text.scan(INTERNAL_LINK_REGEX)
    link_matches.count
  end
end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,19 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature reports the size of the text inserted by the edit's new
# revision.
class InsertedSize < Base
  # Returns the size of the inserted text, never less than 0.
  def calculate(edit)
    super

    [edit.inserted_text.size, 0].max
  end
end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature reports how many words the edit's new revision inserted.
class InsertedWords < Base
  # Returns the number of elements in the edit's inserted words.
  def calculate(edit)
    super

    words = edit.inserted_words
    words.count
  end
end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/text'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
# This feature reports the length of the longest word in the inserted text.
class LongestWord < Base
  # Joins the inserted words with newlines, cleans the result with
  # Text#clean, splits it into tokens and returns the length of the longest
  # token. Returns 0 if there are no tokens.
  def calculate(edit)
    super

    cleaned = Text.new(edit.inserted_words.join("\n")).clean

    # NOTE(review): inside the bracket expression `\b` is a backspace
    # character and `+` is a literal plus sign, so the separators are
    # backspace, whitespace, '+', ',', ';' and ':'.
    token_lengths = cleaned.split(/[\b\s+,;:]/).map(&:length)

    token_lengths.max || 0
  end
end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,29 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/markup'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
# This feature computes the frequency of markup words in the inserted text.
class MarkupFrequency < Base
  MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/

  # Returns the number of MARKUP_REGEX matches in the inserted text divided
  # by the number of inserted words, as a Float.
  # Returns 0.0 if there are no inserted words.
  def calculate(edit)
    super

    inserted = edit.inserted_text
    word_total = edit.inserted_words.count
    markup_hits = inserted.scan(MARKUP_REGEX).count

    return 0.0 unless word_total > 0

    markup_hits.to_f / word_total.to_f
  end
end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,30 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/markup'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
# This feature relates the number of markup words in the old revision's
# text to the number of markup words in the old and new revision's text
# together.
class MarkupImpact < Base
  MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/

  # Returns old_count / (old_count + new_count) as a Float, or 0.5 if
  # MARKUP_REGEX matches in neither revision text.
  def calculate(edit)
    super

    old_text = edit.old_revision.text
    new_text = edit.new_revision.text

    before = old_text.scan(MARKUP_REGEX).count.to_f
    after = new_text.scan(MARKUP_REGEX).count.to_f

    return 0.5 if before.zero? && after.zero?

    before / (before + after)
  end
end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature relates the number of non-alphanumeric characters to the
# number of all non-whitespace characters in the text inserted by the
# edit's new revision.
class NonAlphanumericRatio < Base
  # Returns 0.0 when nothing was inserted. Otherwise returns
  # (1 + non_alphanumerics) / (1 + non_whitespace_characters) as a Float.
  # Only a-z, A-Z and 0-9 are treated as alphanumeric here.
  def calculate(edit)
    super

    inserted = edit.inserted_text

    if inserted.empty?
      0.0
    else
      symbols = inserted.scan(/[^a-zA-Z0-9\s]/).size
      visible = inserted.scan(/[^\s]/).size

      (symbols + 1.0) / (visible + 1.0)
    end
  end
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/features/contains_base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
# This feature checks the comment of the edit's new revision for the phrase
# 'personal life'.
class PersonalLife < ContainsBase
  # Returns the result of `contains` for the new revision's comment and
  # 'personal life'.
  def calculate(edit)
    super

    comment = edit.new_revision.comment
    contains(comment, 'personal life')
  end
end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/pronouns'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
# This feature computes the frequency of pronouns in the inserted text.
class PronounFrequency < FrequencyBase
  # Joins the inserted words with newlines, cleans the result with
  # Text#clean and returns the frequency of WordLists::PRONOUNS in it.
  # Returns 0.0 if the cleaned text has no tokens.
  def calculate(edit)
    super

    joined_words = edit.inserted_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists::PRONOUNS)
  end
end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ require 'wikipedia/vandalism_detection/features/impact_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/pronouns'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
# This feature relates the number of pronouns in the old revision's text to
# the number of pronouns in the old and new revision's text together.
class PronounImpact < ImpactBase
  # Cleans both revision texts and returns the impact of
  # WordLists::PRONOUNS for them (0.5 if neither text contains a pronoun).
  def calculate(edit)
    super

    cleaned_old = edit.old_revision.text.clean
    cleaned_new = edit.new_revision.text.clean

    impact(cleaned_old, cleaned_new, WordLists::PRONOUNS)
  end
end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,22 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
# This feature computes the frequency of the words of all wordlists in the
# removed text.
class RemovedAllWordlistsFrequency < FrequencyBase
  # Joins the removed words with newlines, cleans the result with
  # Text#clean and returns the frequency of WordLists.all in it.
  # Returns 0.0 if the cleaned text has no tokens.
  def calculate(edit)
    super

    joined_words = edit.removed_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists.all)
  end
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/sex'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes the frequency of bad words in the removed text.
9
+ class RemovedBadFrequency < FrequencyBase
10
+ # Returns the percentage of bad words in the removed text, as computed by #frequency.
11
+ # Returns 0.0 if cleaned removed text is of zero length (presumably handled inside #frequency; not visible here).
12
+ def calculate(edit)
13
+ super
14
+
15
+ text = Text.new(edit.removed_words.join("\n")).clean  # removed words joined one per line, then cleaned via Text#clean
16
+ frequency(text, WordLists::BAD)  # NOTE(review): this file requires word_lists/sex, not word_lists/bad; confirm WordLists::BAD is loaded elsewhere
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/biased'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes the frequency of biased words in the removed text.
9
+ class RemovedBiasedFrequency < FrequencyBase
10
+ # Returns the percentage of biased words in the removed text, as computed by #frequency.
11
+ # Returns 0.0 if removed clean text is of zero length (presumably handled inside #frequency; not visible here).
12
+ def calculate(edit)
13
+ super
14
+
15
+ text = Text.new(edit.removed_words.join("\n")).clean  # removed words joined one per line, then cleaned via Text#clean
16
+ frequency(text, WordLists::BIASED)  # frequency is not defined in this class; presumably inherited from FrequencyBase
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/algorithms'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes the Kullback-Leibler divergence of the removed
8
+ # text's character distribution relative to the character distribution
9
+ # of the new revision's text.
10
+ # The smaller the divergence, the higher the similarity of the
11
+ # distributions, and vice versa.
12
+ class RemovedCharacterDistribution < Base
13
+ include Algorithms  # presumably provides kullback_leibler_divergence (not defined in this class)
14
+
15
+ def calculate(edit)
16
+ super
17
+
18
+ kullback_leibler_divergence(edit.new_revision.text, edit.removed_text)  # argument order: new revision text first, removed text second
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,28 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/emoticons'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes the frequency of emoticon words in the removed
8
+ # text.
9
+ class RemovedEmoticonsFrequency < Base
10
+ # Returns the ratio of emoticon matches to whitespace-separated words in the removed text.
11
+ # Returns 0.0 if the removed text contains no words (the text is not cleaned here).
12
+ def calculate(edit)
13
+ super
14
+
15
+ removed_text = edit.removed_text
16
+ emojis = WordLists::EMOTICONS.join('|')  # alternation of all emoticons, interpolated as-is below (assumes entries are regex-safe; TODO confirm)
17
+ regex = /(^|\s)(#{emojis})(?=\s|$|\Z|[\.,!?]\s|[\.!?]\Z)/
18
+
19
+ emoticons_count = removed_text.scan(regex).flatten
20
+ .reject { |c| c.size < 2 }.count  # scan yields both capture groups; drops the leading-separator capture (and any capture shorter than 2 chars)
21
+ total_count = removed_text.split.count  # number of whitespace-separated tokens in the raw removed text
22
+
23
+ total_count > 0 ? emoticons_count.to_f / total_count.to_f : 0.0
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,30 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/markup'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes the frequency of markup words in the removed text.
9
+ class RemovedMarkupFrequency < Base
10
+ MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/
11
+
12
+ # Returns the ratio of MARKUP_REGEX matches in the removed text to the number of removed words.
13
+ # Returns 0.0 if the edit has no removed words (the text is not cleaned here).
14
+ def calculate(edit)
15
+ super
16
+
17
+ text = edit.removed_text
18
+ all_words_count = edit.removed_words.count  # denominator: number of removed words
19
+ markup_words_count = text.scan(MARKUP_REGEX).count  # numerator: number of markup matches in the raw removed text
20
+
21
+ if all_words_count > 0
22
+ markup_words_count.to_f / all_words_count.to_f
23
+ else
24
+ 0.0
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/pronouns'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes the frequency of pronouns in the removed text.
9
+ class RemovedPronounFrequency < FrequencyBase
10
+ # Returns the percentage of pronoun words in the removed text, as computed by #frequency.
11
+ # Returns 0.0 if cleaned removed text is of zero length (presumably handled inside #frequency; not visible here).
12
+ def calculate(edit)
13
+ super
14
+
15
+ text = Text.new(edit.removed_words.join("\n")).clean  # removed words joined one per line, then cleaned via Text#clean
16
+ frequency(text, WordLists::PRONOUNS)  # frequency is not defined in this class; presumably inherited from FrequencyBase
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/sex'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes the frequency of sex words in the removed text.
9
+ class RemovedSexFrequency < FrequencyBase
10
+ # Returns the percentage of sex words in the removed text, as computed by #frequency.
11
+ # Returns 0.0 if cleaned removed text is of zero length (presumably handled inside #frequency; not visible here).
12
+ def calculate(edit)
13
+ super
14
+
15
+ text = Text.new(edit.removed_words.join("\n")).clean  # removed words joined one per line, then cleaned via Text#clean
16
+ frequency(text, WordLists::SEX)  # frequency is not defined in this class; presumably inherited from FrequencyBase
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end