wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,19 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: length of the text that the edit removed, taken from
      # edit.removed_text.size.
      class RemovedSize < Base
        # Returns the size of the removed text, never less than 0.
        #
        # edit - the edit to inspect; must respond to #removed_text.
        def calculate(edit)
          # Base#calculate runs first (defined in features/base, not shown here).
          super

          removed_length = edit.removed_text.size
          return 0 unless removed_length > 0

          removed_length
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/vulgarism'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of vulgarism words among the words the edit removed.
      class RemovedVulgarismFrequency < FrequencyBase
        # Joins the removed words with newlines, cleans the result via
        # Text#clean and hands it to FrequencyBase#frequency together with
        # WordLists::VULGARISM.
        #
        # edit - the edit to inspect; must respond to #removed_words.
        #
        # Returns whatever #frequency returns for the cleaned text
        # (per the original notes: 0.0 when the cleaned text is empty).
        def calculate(edit)
          super

          joined_removed = edit.removed_words.join("\n")
          cleaned = Text.new(joined_removed).clean

          frequency(cleaned, WordLists::VULGARISM)
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: how many words the edit removed, i.e. the number of entries
      # in edit.removed_words.
      class RemovedWords < Base
        # Returns the count of removed words.
        #
        # edit - the edit to inspect; must respond to #removed_words.
        def calculate(edit)
          super

          removed = edit.removed_words
          removed.count
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/diff'
3
+ require 'hotwater'
4
+
5
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: similarity between the text an edit removed and the text it
      # inserted, measured with the Jaro-Winkler distance from the Hotwater gem.
      # See: http://courses.cs.washington.edu/courses/cse590q/04au/papers/Winkler99.pdf
      class ReplacementSimilarity < Base
        # Returns the result of Hotwater.jaro_winkler_distance for
        # (removed text, inserted text).
        #
        # edit - the edit to inspect; must respond to #removed_text and
        #        #inserted_text.
        def calculate(edit)
          super

          removed = edit.removed_text
          inserted = edit.inserted_text

          ::Hotwater.jaro_winkler_distance(removed, inserted)
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'wikipedia/vandalism_detection/features/contains_base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: does the comment of the new revision mention 'copyedit' or
      # 'copy edit'? The check itself is done by ContainsBase#contains.
      class Copyedit < ContainsBase
        # Returns the result of #contains for the new revision's comment and
        # the two copyedit spellings.
        #
        # edit - the edit to inspect; must respond to #new_revision.
        def calculate(edit)
          super

          comment = edit.new_revision.comment
          terms = ['copyedit', 'copy edit']

          contains(comment, terms)
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/algorithms'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: Kullback-Leibler divergence between the character
      # distributions of the old and the new revision text.
      # A smaller divergence means more similar distributions, and vice versa.
      class RevisionsCharacterDistribution < Base
        include Algorithms

        # Returns the result of Algorithms#kullback_leibler_divergence for
        # (old revision text, new revision text).
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        def calculate(edit)
          super

          previous_text = edit.old_revision.text
          current_text = edit.new_revision.text

          kullback_leibler_divergence(previous_text, current_text)
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require_relative 'base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: was the new revision written by the same contributor as the
      # old revision?
      class SameEditor < Base
        # Returns 1 when both revisions have the same contributor, 0 when they
        # differ, and Features::MISSING_VALUE when the old revision has no
        # contributor and the API lookup yields none either.
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        def calculate(edit)
          super

          previous = edit.old_revision

          if previous.contributor.blank?
            # The old revision carries no contributor: ask the Wikipedia API
            # for the user of that revision id.
            response = Wikipedia.api_request(
              prop: 'revisions',
              rvprop: 'user',
              revids: previous.id
            )

            fetched_user = response.xpath('//rev/@user').text
            return Features::MISSING_VALUE if fetched_user.blank?

            # Store the looked-up name on the revision object itself.
            previous.contributor = fetched_user
          end

          same = previous.contributor == edit.new_revision.contributor
          same ? 1 : 0
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/sex'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of sex-related words (WordLists::SEX) among the
      # words the edit inserted.
      class SexFrequency < FrequencyBase
        # Joins the inserted words with newlines, cleans the result via
        # Text#clean and hands it to FrequencyBase#frequency together with
        # WordLists::SEX.
        #
        # edit - the edit to inspect; must respond to #inserted_words.
        #
        # Returns whatever #frequency returns for the cleaned text
        # (per the original notes: 0.0 when the cleaned text is empty).
        def calculate(edit)
          super

          joined_inserted = edit.inserted_words.join("\n")
          cleaned = Text.new(joined_inserted).clean

          frequency(cleaned, WordLists::SEX)
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'wikipedia/vandalism_detection/features/impact_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/sex'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: impact of the edit on the number of sex-related words
      # (WordLists::SEX) in the article text, as computed by
      # ImpactBase#impact.
      class SexImpact < ImpactBase
        # Returns the result of #impact for the cleaned old and new revision
        # texts.
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        def calculate(edit)
          super

          before = edit.old_revision.text.clean
          after = edit.new_revision.text.clean

          impact(before, after, WordLists::SEX)
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: change in text length between the old and the new revision.
      class SizeIncrement < Base
        # Returns |new| - |old|: positive when the text grew, negative when it
        # shrank.
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        def calculate(edit)
          super

          previous_length = edit.old_revision.text.size
          current_length = edit.new_revision.text.size

          current_length - previous_length
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: share of the old revision's text length in the combined
      # length of both revisions.
      class SizeRatio < Base
        # Computes old / (old + new) on the text sizes:
        #   0.0 when only the old revision text is empty,
        #   1.0 when only the new revision text is empty,
        #   0.5 when both texts are empty or have the same size.
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        #
        # Returns a Float.
        def calculate(edit)
          super

          previous_length = edit.old_revision.text.size.to_f
          current_length = edit.new_revision.text.size.to_f

          # Both empty: avoid 0.0 / 0.0 and report the "same size" value.
          return 0.5 if previous_length.zero? && current_length.zero?

          previous_length / (previous_length + current_length)
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
require 'wikipedia/vandalism_detection/features/base'
require 'wikipedia/vandalism_detection/features/frequency_base'
require 'wikipedia/vandalism_detection/text'
require 'wikipedia/vandalism_detection/diff'
4
+
5
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: average frequency, within the new revision's text, of the
      # distinct terms the edit inserted.
      class TermFrequency < FrequencyBase
        # Builds the list of unique inserted terms (cleaned, punctuation
        # stripped), sums FrequencyBase#frequency of each term against the
        # cleaned new revision text and divides by the number of terms.
        #
        # edit - the edit to inspect; must respond to #inserted_words and
        #        #new_revision.
        #
        # Returns the averaged frequency, or 0.0 when nothing was inserted.
        def calculate(edit)
          super

          inserted_terms = Text.new(edit.inserted_words.join("\n"))
                               .clean.gsub(/[^\w\s]/, '').split.uniq

          return 0.0 if inserted_terms.empty?

          # Clean the article text once instead of once per inserted term.
          clean_new_text = edit.new_revision.text.clean

          summed_frequencies = inserted_terms.reduce(0) do |sum, term|
            sum + frequency(clean_new_text, term)
          end

          summed_frequencies / inserted_terms.count
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'date'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: time elapsed between the old and the new revision, in days.
      class TimeInterval < Base
        # Returns the absolute difference of the two revision timestamps as a
        # Float (DateTime subtraction yields days), or Features::MISSING_VALUE
        # when no timestamp can be found for the old revision.
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        def calculate(edit)
          super

          current_time = DateTime.parse(edit.new_revision.timestamp)
          previous_stamp = timestamp_for(edit.old_revision)

          if previous_stamp
            (current_time - DateTime.parse(previous_stamp)).to_f.abs
          else
            Features::MISSING_VALUE
          end
        end

        private

        # Returns the revision's own timestamp when it has one; otherwise asks
        # the Wikipedia API for the timestamp of that revision id. Yields nil
        # when the API answer contains no timestamp.
        def timestamp_for(revision)
          if revision.timestamp.present?
            revision.timestamp
          else
            response = Wikipedia.api_request(
              prop: 'revisions',
              rvprop: 'timestamp',
              revids: revision.id
            )

            response.xpath('//rev/@timestamp').text.presence
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'date'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: time of day at which the new revision was saved, expressed
      # as decimal hours.
      class TimeOfDay < Base
        # Parses the new revision's timestamp and converts its time part to
        # hours: hour + minutes / 60 + seconds / 3600.
        # Example: 13:30:36 => 13.51
        #
        # edit - the edit to inspect; must respond to #new_revision.
        #
        # Returns a Float in the range 0.0...24.0.
        def calculate(edit)
          super

          timestamp = edit.new_revision.timestamp
          time = DateTime.parse(timestamp)

          # An hour has 3600 seconds; dividing by 360 would overweight the
          # seconds tenfold.
          time.hour.to_f + time.min / 60.0 + time.sec / 3600.0
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: ratio of uppercase letters to all letters in the text the
      # edit inserted.
      class UpperCaseRatio < Base
        # Returns (1 + uppercase letters) / (1 + all letters) as a Float, or
        # 0.0 when nothing was inserted. Because both counts are offset by 1,
        # non-empty text without any letters yields 1.0.
        #
        # edit - the edit to inspect; must respond to #inserted_text.
        def calculate(edit)
          super

          inserted = edit.inserted_text

          if inserted.empty?
            0.0
          else
            capitals = inserted.scan(/[[:upper:]]/).length
            letters = inserted.scan(/[[:alpha:]]/).length

            (capitals + 1.0) / (letters + 1.0)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'wikipedia/vandalism_detection/features/base'
4
+ require 'wikipedia/vandalism_detection/text'
5
+
6
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: ratio of fully uppercase words to all words in the text the
      # edit inserted.
      class UpperCaseWordsRatio < Base
        # Drops inserted words without any ASCII letter, cleans the rest,
        # strips punctuation and splits into words. A word counts as
        # uppercase when it equals its own #upcase.
        #
        # edit - the edit to inspect; must respond to #inserted_words.
        #
        # Returns (1 + uppercase words) / (1 + all words) as a Float, or 0.0
        # when no words remain.
        def calculate(edit)
          super

          # reject (not delete_if) so the array owned by the edit is left
          # untouched for any other feature that reads it.
          alpha_words = edit.inserted_words.reject do |word|
            word.gsub(/[^A-Za-z]/, '').empty?
          end

          words = Text.new(alpha_words.join("\n"))
                      .clean.gsub(/[^\w\s]/, '').split

          return 0.0 if words.empty?

          uppercase_words_count = words.count { |word| word == word.upcase }

          (1.0 + uppercase_words_count) / (1.0 + words.count)
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: ratio of uppercase letters to lowercase letters in the text
      # the edit inserted.
      class UpperToLowerCaseRatio < Base
        # Returns (1 + uppercase letters) / (1 + lowercase letters) as a
        # Float, or 0.0 when nothing was inserted. Both counts are offset by 1,
        # so text without lowercase letters does not divide by zero.
        #
        # edit - the edit to inspect; must respond to #inserted_text.
        def calculate(edit)
          super

          inserted = edit.inserted_text

          if inserted.empty?
            0.0
          else
            capitals = inserted.scan(/[[:upper:]]/).length
            smalls = inserted.scan(/[[:lower:]]/).length

            (capitals + 1.0) / (smalls + 1.0)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/vulgarism'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of vulgarism words among the words the edit
      # inserted.
      class VulgarismFrequency < FrequencyBase
        # Joins the inserted words with newlines, cleans the result via
        # Text#clean and hands it to FrequencyBase#frequency together with
        # WordLists::VULGARISM.
        #
        # edit - the edit to inspect; must respond to #inserted_words.
        #
        # Returns whatever #frequency returns for the cleaned text
        # (per the original notes: 0.0 when the cleaned text is empty).
        def calculate(edit)
          super

          joined_inserted = edit.inserted_words.join("\n")
          cleaned = Text.new(joined_inserted).clean

          frequency(cleaned, WordLists::VULGARISM)
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/impact_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/vulgarism'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: impact of the edit on the number of vulgarism words
      # (WordLists::VULGARISM) in the article text, as computed by
      # ImpactBase#impact.
      class VulgarismImpact < ImpactBase
        # Returns the result of #impact for the cleaned old and new revision
        # texts.
        #
        # edit - the edit to inspect; must respond to #old_revision and
        #        #new_revision.
        def calculate(edit)
          super

          before = edit.old_revision.text.clean
          after = edit.new_revision.text.clean

          impact(before, after, WordLists::VULGARISM)
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'date'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: weekday on which the new revision was saved, as a number.
      # Uses DateTime#wday, so Sunday => 0, Monday => 1, Tuesday => 2, etc.
      class Weekday < Base
        # Returns the weekday index (0..6) parsed from the new revision's
        # timestamp.
        #
        # edit - the edit to inspect; must respond to #new_revision.
        def calculate(edit)
          super

          saved_at = DateTime.parse(edit.new_revision.timestamp)
          saved_at.wday
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: net change in word count caused by the edit.
      class WordsIncrement < Base
        # Returns |inserted| - |removed|: positive when more words were
        # inserted than removed, negative otherwise.
        #
        # edit - the edit to inspect; must respond to #inserted_words and
        #        #removed_words.
        def calculate(edit)
          super

          added = edit.inserted_words.count
          dropped = edit.removed_words.count

          added - dropped
        end
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'wikipedia/vandalism_detection/features/all_wordlists_frequency'
2
+ require 'wikipedia/vandalism_detection/features/all_wordlists_impact'
3
+ require 'wikipedia/vandalism_detection/features/anonymity'
4
+ require 'wikipedia/vandalism_detection/features/anonymity_previous'
5
+ require 'wikipedia/vandalism_detection/features/article_size'
6
+ require 'wikipedia/vandalism_detection/features/bad_frequency'
7
+ require 'wikipedia/vandalism_detection/features/bad_impact'
8
+ require 'wikipedia/vandalism_detection/features/biased_frequency'
9
+ require 'wikipedia/vandalism_detection/features/biased_impact'
10
+ require 'wikipedia/vandalism_detection/features/blanking'
11
+ require 'wikipedia/vandalism_detection/features/character_diversity'
12
+ require 'wikipedia/vandalism_detection/features/character_sequence'
13
+ require 'wikipedia/vandalism_detection/features/comment_length'
14
+ require 'wikipedia/vandalism_detection/features/comment_bad_frequency'
15
+ require 'wikipedia/vandalism_detection/features/comment_biased_frequency'
16
+ require 'wikipedia/vandalism_detection/features/comment_markup_frequency'
17
+ require 'wikipedia/vandalism_detection/features/comment_pronoun_frequency'
18
+ require 'wikipedia/vandalism_detection/features/comment_sex_frequency'
19
+ require 'wikipedia/vandalism_detection/features/comment_vulgarism_frequency'
20
+ require 'wikipedia/vandalism_detection/features/compressibility'
21
+ require 'wikipedia/vandalism_detection/features/copyedit'
22
+ require 'wikipedia/vandalism_detection/features/digit_ratio'
23
+ require 'wikipedia/vandalism_detection/features/edits_per_user'
24
+ require 'wikipedia/vandalism_detection/features/emoticons_frequency'
25
+ require 'wikipedia/vandalism_detection/features/emoticons_impact'
26
+ require 'wikipedia/vandalism_detection/features/inserted_size'
27
+ require 'wikipedia/vandalism_detection/features/inserted_words'
28
+ require 'wikipedia/vandalism_detection/features/inserted_character_distribution'
29
+ require 'wikipedia/vandalism_detection/features/inserted_external_links'
30
+ require 'wikipedia/vandalism_detection/features/inserted_internal_links'
31
+ require 'wikipedia/vandalism_detection/features/longest_word'
32
+ require 'wikipedia/vandalism_detection/features/markup_frequency'
33
+ require 'wikipedia/vandalism_detection/features/markup_impact'
34
+ require 'wikipedia/vandalism_detection/features/non_alphanumeric_ratio'
35
+ require 'wikipedia/vandalism_detection/features/personal_life'
36
+ require 'wikipedia/vandalism_detection/features/pronoun_frequency'
37
+ require 'wikipedia/vandalism_detection/features/pronoun_impact'
38
+ require 'wikipedia/vandalism_detection/features/removed_all_wordlists_frequency'
39
+ require 'wikipedia/vandalism_detection/features/removed_bad_frequency'
40
+ require 'wikipedia/vandalism_detection/features/removed_biased_frequency'
41
+ require 'wikipedia/vandalism_detection/features/removed_character_distribution'
42
+ require 'wikipedia/vandalism_detection/features/removed_emoticons_frequency'
43
+ require 'wikipedia/vandalism_detection/features/removed_markup_frequency'
44
+ require 'wikipedia/vandalism_detection/features/removed_pronoun_frequency'
45
+ require 'wikipedia/vandalism_detection/features/removed_sex_frequency'
46
+ require 'wikipedia/vandalism_detection/features/removed_vulgarism_frequency'
47
+ require 'wikipedia/vandalism_detection/features/removed_size'
48
+ require 'wikipedia/vandalism_detection/features/removed_words'
49
+ require 'wikipedia/vandalism_detection/features/replacement_similarity'
50
+ require 'wikipedia/vandalism_detection/features/reverted'
51
+ require 'wikipedia/vandalism_detection/features/revisions_character_distribution'
52
+ require 'wikipedia/vandalism_detection/features/same_editor'
53
+ require 'wikipedia/vandalism_detection/features/sex_frequency'
54
+ require 'wikipedia/vandalism_detection/features/sex_impact'
55
+ require 'wikipedia/vandalism_detection/features/size_increment'
56
+ require 'wikipedia/vandalism_detection/features/size_ratio'
57
+ require 'wikipedia/vandalism_detection/features/term_frequency'
58
+ require 'wikipedia/vandalism_detection/features/time_interval'
59
+ require 'wikipedia/vandalism_detection/features/time_of_day'
60
+ require 'wikipedia/vandalism_detection/features/upper_case_ratio'
61
+ require 'wikipedia/vandalism_detection/features/upper_case_words_ratio'
62
+ require 'wikipedia/vandalism_detection/features/upper_to_lower_case_ratio'
63
+ require 'wikipedia/vandalism_detection/features/vulgarism_frequency'
64
+ require 'wikipedia/vandalism_detection/features/vulgarism_impact'
65
+ require 'wikipedia/vandalism_detection/features/weekday'
66
+ require 'wikipedia/vandalism_detection/features/words_increment'