wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: bf756c5448798deaecad9dff7f1158124f1665eae7f65e6e3cd1c018dcb4b273
4
+ data.tar.gz: ec45e4a4a402eb9dadada7570f094cd5be294634da3e31ce28603bd48666e74c
5
+ SHA512:
6
+ metadata.gz: a72ec32117e19bbac2764eb01022f608c4eb91121e6d552c1a05a230b559a5279e51fe8e7970b48667d6450ebb0b23fc36338ade74bb47d729018fbdb4b39868
7
+ data.tar.gz: 8eb0fb8fe4d2e0ed681543cf0a76dd9a806253cf8e43ce2dd224137ad0970d1f7e9f84caf2b1fd22f289d3553414e449799d5f50c010d435ac2d6a3d5afa4a93
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/resources/build
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ /config/*.yml
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --force-color
2
+ --order rand
data/.rubocop.yml ADDED
@@ -0,0 +1,35 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.4
3
+ Exclude:
4
+ - 'bin/**/*'
5
+ - '*.gemspec'
6
+ - 'Gemfile'
7
+ - 'Gemfile.lock'
8
+
9
+ Style/Copyright:
10
+ Enabled: false
11
+
12
+ Style/Documentation:
13
+ Enabled: false
14
+
15
+ Metrics/LineLength:
16
+ Max: 80
17
+ Exclude:
18
+ - '**/*_spec.rb'
19
+ - 'spec/factories/*.rb'
20
+
21
+ Layout/MultilineMethodCallIndentation:
22
+ EnforcedStyle: indented
23
+
24
+ Style/FrozenStringLiteralComment:
25
+ Enabled: false
26
+
27
+ Metrics/ModuleLength:
28
+ Exclude:
29
+ - '**/*_spec.rb'
30
+ - 'spec/factories/*.rb'
31
+
32
+ Metrics/BlockLength:
33
+ Exclude:
34
+ - '**/*_spec.rb'
35
+ - 'spec/factories/*.rb'
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - jruby-9.1.0.0
5
+ - jruby-9.2.0.0
6
+ - jruby-head
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in wikipedia-vandalism_detection.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,4 @@
1
+ Copyright (c) 2014-2018 Paul Götze
2
+
3
+ This software is licensed under the GPL v3.
4
+ For further information and the full license text see: http://www.gnu.org/licenses/gpl-3.0.en.html
data/README.md ADDED
@@ -0,0 +1,288 @@
1
+ # Wikipedia Vandalism Detection
2
+
3
+ Vandalism detection on the Wikipedia history with JRuby v9.1.0.0+.
4
+
5
+ The Wikipedia Vandalism Detection Gem uses the Weka Machine-Learning Library
6
+ via the [weka](https://github.com/paulgoetze/weka-jruby) gem.
7
+
8
+ [![Gem Version](https://badge.fury.io/rb/wikipedia-vandalism_detection.svg)](http://badge.fury.io/rb/wikipedia-vandalism_detection)
9
+ [![Build Status](https://travis-ci.org/paulgoetze/wikipedia-vandalism-detection.png?branch=develop)](https://travis-ci.org/paulgoetze/wikipedia-vandalism-detection)
10
+
11
+ ## What you can do with it
12
+
13
+ * parsing Wikipedia history pages to get edits and revisions
14
+ * creating training and test ARFF files from
15
+ the [WVC-PAN-10](https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-pan-wvc-10) and
16
+ the [WVC-PAN-11](https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-pan-wvc-11)
17
+ (See also http://pan.webis.de under category Wikipedia Vandalism Detection: [CLEF 2010](http://pan.webis.de/clef10/pan10-web/wikipedia-vandalism-detection) & [CLEF 2011](http://pan.webis.de/clef11/pan11-web/wikipedia-vandalism-detection))
18
+
19
+ * calculating vandalism features for a Wikipedia page (XML) from the history dump
20
+ * creating and evaluating a classifier with the created training ARFF file
21
+ * classifying new instances of Wikipedia edits as 'regular' or 'vandalism'
22
+
23
+ ## Installation
24
+
25
+ Add this line to your application's Gemfile:
26
+
27
+ gem 'wikipedia-vandalism_detection'
28
+
29
+ And then execute:
30
+
31
+ $ bundle
32
+
33
+ Or install it yourself as:
34
+
35
+ $ gem install wikipedia-vandalism_detection
36
+
37
+ ## Usage
38
+
39
+ require 'wikipedia/vandalism_detection'
40
+
41
+ ### Configuration
42
+
43
+ To configure the system put a `wikipedia-vandalism-detection.yml` file in the
44
+ `config/` or `lib/config/` directory.
45
+
46
+ You can configure:
47
+
48
+ A) the training and test corpora directories and essential input and output files
49
+
50
+ ```YAML
51
+ corpora:
52
+ base_directory: /home/user/corpora
53
+
54
+ training:
55
+ base_directory: training
56
+ annotations_file: annotations.csv
57
+ edits_file: edits.csv
58
+ revisions_directory: revisions
59
+
60
+ test:
61
+ base_directory: test
62
+ edits_file: edits.csv
63
+ revisions_directory: revisions
64
+
65
+ output:
66
+ base_directory: /home/user/output_path
67
+ training:
68
+ arff_file: training.arff
69
+ index_file: training_index.yml
70
+ test:
71
+ arff_file: test.arff
72
+ index_file: test_index.yml
73
+ ```
74
+
75
+ Evaluation outputs are saved under the output base directory path.
76
+
77
+ B) the features used by the feature calculator
78
+
79
+ ```YAML
80
+ features:
81
+ - anonymity
82
+ - biased frequency
83
+ - character sequence
84
+ - ...
85
+ ```
86
+
87
+ C) the classifier type and its options and the number of cross validation splits
88
+ for the classifier evaluation
89
+
90
+ ```YAML
91
+ classifier:
92
+ type: Trees::RandomForest # Weka classifier class
93
+ options: -I 10 -K 0.5 # same as for Weka, for further classifier options see Weka-dev documentation
94
+ cross-validation-fold: 5 # default is 10
95
+ training-data-options: balanced # default is unbalanced
96
+ ```
97
+
98
+ `training-data-options` is used to resample the training dataset:
99
+
100
+ * `unbalanced` is the default value and uses the original dataset
101
+ * `balanced` uses random undersampling of the majority class
102
+ * `oversampled` uses SMOTE oversampling (with percentage `-p`) and random undersampling (with minority/majority class balance `-u`)
103
+
104
+ Examples:
105
+
106
+ ```YAML
107
+ # 200% SMOTE oversampling with 300% random undersampling
108
+ training-data-options: oversampled -p 200 -u true 300
109
+
110
+ # default 100% SMOTE oversampling with 300% random undersampling
111
+ training-data-options: oversampled -u true 300
112
+
113
+ # 200% SMOTE oversampling with default full (100% minority/majority class balance)
114
+ # random undersampling
115
+ training-data-options: oversampled -p 200
116
+
117
+ # default 100% SMOTE oversampling without undersampling
118
+ training-data-options: oversampled -u false
119
+ ```
120
+
121
+ Instead of the `true` option you can also use `t`, `y` and `yes` as well as their upper case counterparts.
122
+
123
+ ### Examples
124
+
125
+ **Create training and test ARFF file from configured corpus:**
126
+
127
+ ```ruby
128
+ training_dataset = Wikipedia::VandalismDetection::TrainingDataset.build
129
+ test_dataset = Wikipedia::VandalismDetection::TestDataset.build
130
+ ```
131
+
132
+ While creating the training and test datasets, a corpus file index is created for each of them and written to the
133
+ configured `index_file`.
134
+ To run the corpus file index creation manually use:
135
+
136
+ ```ruby
137
+ Wikipedia::VandalismDetection::TrainingDataset.create_file_index!
138
+ Wikipedia::VandalismDetection::TestDataset.create_file_index!
139
+ ```
140
+
141
+ **Parse a Wikipedia page content:**
142
+
143
+ At the moment no namespaces are supported while parsing a page.
144
+ So, the `<page>...</page>` tags should not be included in a namespaced xml tag!
145
+
146
+ ```ruby
147
+ xml = File.read('wikipedia_page.xml')
148
+ parser = Wikipedia::VandalismDetection::PageParser.new
149
+ page = parser.parse(xml)
150
+
151
+ # Work with revisions and edits from the page
152
+ page.revisions.each do |revision|
153
+ puts revision.id
154
+ puts revision.parent_id
155
+ end
156
+
157
+ page.edits.each do |edit|
158
+ puts edit.new_revision.id
159
+ puts edit.old_revision.id
160
+ end
161
+ ```
162
+
163
+ **Use a classifier of configured type:**
164
+
165
+ Create the classifier:
166
+
167
+ ```ruby
168
+ classifier = Wikipedia::VandalismDetection::Classifier.new
169
+ ```
170
+
171
+ Evaluation of the classifier against the configured training corpus:
172
+
173
+ ```ruby
174
+ # classifier.classifier_instance returns the weka classifier instance
175
+ evaluation = classifier.classifier_instance.cross_validate(folds: 10)
176
+ puts evaluation.class_details
177
+ ```
178
+
179
+ Classify a new edit:
180
+
181
+ ```ruby
182
+ # Classification of a Wikipedia Edit or a feature set
183
+ # 'edit' is a Wikipedia::VandalismDetection::Edit, this can be built manually or by
184
+ # parsing a Wikipedia page content and getting its edits
185
+ # The returned confidence is a value between 0.0 and 1.0 where 0.0 means 'regular' and 1.0 means 'vandalism'
186
+ confidence = classifier.classify(edit)
187
+
188
+ feature_calculator = Wikipedia::VandalismDetection::FeatureCalculator.new
189
+ features = feature_calculator.calculate_features_for(edit)
190
+ confidence = classifier.classify(features)
191
+ ```
192
+
193
+ Evaluate test corpus classification:
194
+
195
+ ```ruby
196
+ evaluator = classifier.evaluator
197
+ # or create a new evaluator
198
+ evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
199
+
200
+ performance_data = evaluator.evaluate_testcorpus_classification #default sample_count = 100
201
+ performance_data = evaluator.evaluate_testcorpus_classification(sample_count: 200)
202
+
203
+ # following attributes can be used for further computations
204
+ recall_values = performance_data[:recalls] # recall values for e.g. x-values of PRC or y-values of ROC
205
+ precision_values = performance_data[:precisions] # precision values for e.g. y-values of PRC
206
+ fp_rate_values = performance_data[:fp_rates] # false positive rate values for e.g. x-values of ROC
207
+ area_under_curve_pr = performance_data[:pr_auc] # computed from the precision and recall values
208
+ area_under_curve_ro = performance_data[:roc_auc] # computed from the recall and fp-rate values
209
+ total_recall = performance_data[:total_recall] # precision and recall values with maximum area (rectangle area)
210
+ total_precision = performance_data[:total_precision]
211
+ ```
212
+
213
+ Get each feature's predictive value for analysis:
214
+
215
+ ```ruby
216
+ evaluator = classifier.evaluator
217
+ # or create a new evaluator
218
+ evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
219
+
220
+ analysis_data = evaluator.feature_analysis #default sample_count = 100
221
+ analysis_data = evaluator.feature_analysis(sample_count: 1000)
222
+ ```
223
+
224
+ This returns a hash whose keys are the configured feature names and whose values are the threshold hashes.
225
+
226
+ ```ruby
227
+ {
228
+ feature_name_1:
229
+ {
230
+ 0.0 => {fp:… , fn:… , tp:… , tn:… },
231
+ …,
232
+ 1.0 => {fp:… , fn:… , tp:… , tn:… }
233
+ },
234
+ …,
235
+ feature_name_n:
236
+ {
237
+ 0.0 => {fp:… , fn:… , tp:… , tn:… },
238
+ …,
239
+ 1.0 => {fp:… , fn:… , tp:… , tn:… }
240
+ },
241
+ }
242
+ ```
243
+
244
+ **Creating new Features:**
245
+
246
+ You can define your own new Feature classes and use them by configuration in the config.yml.
247
+
248
+ Make sure to define the Feature class inside of the `Wikipedia::VandalismDetection::Features` module
249
+ and to implement the `calculate` method
250
+ (also refer to the `Wikipedia::VandalismDetection::Features::Base` class definition).
251
+
252
+ ```ruby
253
+ module Wikipedia
254
+ module VandalismDetection
255
+ module Features
256
+ class MyNewFeature < Base
257
+ def calculate(edit)
258
+ super # ensures raising an error if 'edit' is not an Edit.
259
+
260
+ # ...your implementation
261
+ end
262
+ end
263
+ end
264
+ end
265
+ end
266
+ ```
267
+
268
+ While creating new Feature classes you should be aware of the following naming convention:
269
+ The feature's name in the config.yml is the *downcased name with spaces or dashes* of the feature class name
270
+
271
+ E.g.:
272
+
273
+ ```YAML
274
+ features:
275
+ - my new feature
276
+ - my-new-feature
277
+ ```
278
+
279
+ both search for a Feature class with the name `MyNewFeature`.
280
+
281
+
282
+ ## Contributing
283
+
284
+ 1. Fork it ( http://github.com/paulgoetze/wikipedia-vandalism_detection/fork )
285
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
286
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
287
+ 4. Push to the branch (`git push origin my-new-feature`)
288
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
7
+
8
+ desc 'Start an irb session with the gem loaded'
9
+ task :irb do
10
+ sh 'irb -I ./lib -r wikipedia/vandalism_detection'
11
+ end
@@ -0,0 +1,103 @@
1
+ # Configuring the training and test corpora directories and essential input and output files.
2
+ # As corpora the WVC-PAN-10 and WVC-PAN-11 can be used (see http://webis.de/ under Research -> Corpora).
3
+
4
+ corpora:
5
+ base_directory: /home/user/corpora
6
+
7
+ training:
8
+ base_directory: training
9
+ annotations_file: annotations.csv
10
+ edits_file: edits.csv
11
+ revisions_directory: revisions
12
+
13
+ test:
14
+ base_directory: test
15
+ edits_file: edits.csv
16
+ revisions_directory: revisions
17
+
18
+ output:
19
+ base_directory: /home/user/output_path
20
+ training:
21
+ arff_file: training.arff
22
+ index_file: training_index.yml
23
+ test:
24
+ arff_file: test.arff
25
+ index_file: test_index.yml
26
+
27
+
28
+ # Configuring the used features.
29
+ # See lib/wikipedia/vandalism_detection/features for the available feature classes.
30
+
31
+ features:
32
+ - anonymity
33
+ - anonymity previous
34
+ - all wordlists frequency
35
+ - all wordlists impact
36
+ - article size
37
+ - bad frequency
38
+ - bad impact
39
+ - biased frequency
40
+ - biased impact
41
+ - blanking
42
+ - character sequence
43
+ - character diversity
44
+ - comment length
45
+ - comment biased frequency
46
+ - comment pronoun frequency
47
+ - comment vulgarism frequency
48
+ - compressibility
49
+ - copyedit
50
+ - digit ratio
51
+ - edits per user
52
+ - emoticons frequency
53
+ - emoticons impact
54
+ - inserted size
55
+ - inserted words
56
+ - inserted character distribution
57
+ - inserted external links
58
+ - inserted internal links
59
+ - longest word
60
+ - markup frequency
61
+ - markup impact
62
+ - non-alphanumeric ratio
63
+ - personal life
64
+ - pronoun frequency
65
+ - pronoun impact
66
+ - removed size
67
+ - removed words
68
+ - removed all wordlists frequency
69
+ - removed bad frequency
70
+ - removed biased frequency
71
+ - removed character distribution
72
+ - removed emoticons frequency
73
+ - removed markup frequency
74
+ - removed pronoun frequency
75
+ - removed sex frequency
76
+ - removed vulgarism frequency
77
+ - replacement similarity
78
+ - reverted
79
+ - revisions character distribution
80
+ - sex frequency
81
+ - sex impact
82
+ - same editor
83
+ - size increment
84
+ - size ratio
85
+ - term frequency
86
+ - time interval
87
+ - time of day
88
+ - upper case ratio
89
+ - upper case words ratio
90
+ - upper to lower case ratio
91
+ - vulgarism frequency
92
+ - vulgarism impact
93
+ - weekday
94
+ - words increment
95
+
96
+
97
+ # Configuring the used classifier
98
+
99
+ classifier:
100
+ type: Trees::RandomForest # Weka classifier class
101
+ options: -I 10 -K 0.5 # same as for Weka, for further classifier options see the Weka-dev documentation
102
+ cross-validation-fold: 5 # default is 10
103
+ training-data-options: balanced # default is unbalanced
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,21 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
module Weka
  module Classifiers
    module Meta
      # The jar must be loaded before the class is built from it.
      require('java/oneClassClassifier.jar')
      include ClassBuilder

      # One class classifier by C. Hempstalk
      # (cite: http://dl.acm.org/citation.cfm?id=1431987).
      # The jar can be downloaded at:
      # http://sourceforge.net/projects/weka/files/weka-packages/oneClassClassifier1.0.4.zip
      build_class(:OneClassClassifier)

      # Reopens the built class to expose its type name.
      class OneClassClassifier
        class << self
          # Returns the classifier's type string, 'Meta::OneClassClassifier'.
          def type
            'Meta::OneClassClassifier'
          end
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
module Weka
  module Classifiers
    module Meta
      # The jar must be loaded before the class is built from it.
      require('java/realAdaBoost.jar')
      include ClassBuilder

      # Real AdaBoost classifier, see:
      # http://www.stanford.edu/~hastie/Papers/AdditiveLogisticRegression/alr.pdf
      # The jar can be downloaded at:
      # http://prdownloads.sourceforge.net/weka/realAdaBoost1.0.1.zip?download
      build_class(:RealAdaBoost)
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
module Weka
  module Classifiers
    module Trees
      # The jar must be loaded before the class is built from it.
      require('java/balancedRandomForest.jar')
      include ClassBuilder

      # Balanced RandomForest classifier, modified from
      # https://github.com/jdurbin/durbinlib/blob/master/src/durbin/weka/BalancedRandomForest.java
      # and
      # https://github.com/jdurbin/durbinlib/blob/master/src/durbin/weka/BalancedRandomTree.java
      build_class(:BalancedRandomForest)
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
module Weka
  module Filters
    module Supervised
      module Instance
        # The jar must be loaded before the class is built from it.
        require('java/SMOTE.jar')
        include ClassBuilder

        # Builds the SMOTE filter class (presumably wrapping the Java class
        # shipped in SMOTE.jar; confirm against Weka::ClassBuilder).
        build_class(:SMOTE)
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Algorithms
      # Symmetric Kullback-Leibler divergence (with simple back-off) between
      # the alphanumeric character distributions of two texts.
      class KullbackLeiblerDivergence
        # Largest tolerated deviation of a normalized distribution's total
        # from 1.0 before #of raises.
        ALLOWED_ERROR = 9e-6

        # Returns the symmetric Kullback-Leibler divergence with simple back-off
        # of the given texts' character distributions. Only alphanumeric
        # characters are counted, case-insensitively. For implementation
        # details see:
        # https://web.archive.org/web/20130508191111/http://staff.science.uva.nl/~tsagias/?p=185.
        #
        # @param text_a [String] the first text
        # @param text_b [String] the second text
        # @return [Float] the divergence, or Features::MISSING_VALUE if either
        #   cleaned text contains no alphanumeric character
        # @raise [Exception] if a character distribution does not sum up to 1.0
        def of(text_a, text_b)
          text_a = cleanup_text(text_a)
          text_b = cleanup_text(text_b)

          unless text_a.match(/[[:alnum:]]/) && text_b.match(/[[:alnum:]]/)
            return Features::MISSING_VALUE
          end

          distribution_a = character_distribution(text_a)
          distribution_b = character_distribution(text_b)

          sum_a = distribution_a.values.inject(0, :+)
          sum_b = distribution_b.values.inject(0, :+)

          check_integrity(distribution_a, sum_a)
          check_integrity(distribution_b, sum_b)

          # Characters occurring in text_b but not in text_a.
          # NOTE(review): verify the direction of this difference against the
          # referenced article.
          character_diff = distribution_b.keys - distribution_a.keys

          # Back-off probability for characters of text_a missing in text_b:
          # a small fraction of the rarest observed character probability.
          epsilon = [
            distribution_a.values.min / sum_a,
            distribution_b.values.min / sum_b
          ].min * 0.001

          # Scales the probabilities of text_b by the mass reserved for back-off.
          gamma = 1 - character_diff.size * epsilon

          distribution_a.inject(0.0) do |divergence, (character, frequency_a)|
            prob_a = frequency_a / sum_a
            frequency_b = distribution_b[character]
            prob_b = frequency_b ? gamma * (frequency_b / sum_b) : epsilon

            divergence + (prob_a - prob_b) * Math.log(prob_a / prob_b)
          end
        end

        private

        # Re-encodes the text to UTF-8 while treating its bytes as binary and
        # replacing every invalid or unmappable byte with the empty string.
        # NOTE(review): because the source encoding is declared as 'binary',
        # this appears to drop all non-ASCII bytes, not only invalid UTF-8
        # sequences; confirm this is intended.
        def cleanup_text(text)
          text.encode(
            'UTF-8',
            'binary',
            invalid: :replace,
            undef: :replace,
            replace: ''
          )
        end

        # Returns a hash mapping each downcased alphanumeric character (as a
        # Symbol) to its relative frequency among all alphanumeric characters
        # of the text. Returns an empty hash if there are none.
        def character_distribution(text)
          return {} if text.empty?

          characters = text.downcase.scan(/[[:alnum:]]/)

          counts = Hash.new(0)
          characters.each { |character| counts[character.to_sym] += 1 }

          total = characters.count.to_f
          Hash[counts.map { |character, count| [character, count / total] }]
        end

        # Checks if the values, each divided by sum, add up to 1.0 within
        # ALLOWED_ERROR; raises an error if they don't.
        def check_integrity(distribution, sum)
          total = distribution.values
                              .inject(0) { |result, value| result + (value / sum) }

          # Take the absolute value of the difference itself, so that totals
          # above 1.0 are rejected as well as totals below it.
          return if (1.0 - total).abs <= ALLOWED_ERROR

          # Exception (not StandardError) is kept so that the raised class
          # stays what existing callers may rescue.
          raise(Exception, 'Text distribution does not sum up to 1.0')
        end
      end
    end
  end
end