wikipedia-vandalism_detection 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: bf756c5448798deaecad9dff7f1158124f1665eae7f65e6e3cd1c018dcb4b273
4
+ data.tar.gz: ec45e4a4a402eb9dadada7570f094cd5be294634da3e31ce28603bd48666e74c
5
+ SHA512:
6
+ metadata.gz: a72ec32117e19bbac2764eb01022f608c4eb91121e6d552c1a05a230b559a5279e51fe8e7970b48667d6450ebb0b23fc36338ade74bb47d729018fbdb4b39868
7
+ data.tar.gz: 8eb0fb8fe4d2e0ed681543cf0a76dd9a806253cf8e43ce2dd224137ad0970d1f7e9f84caf2b1fd22f289d3553414e449799d5f50c010d435ac2d6a3d5afa4a93
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/resources/build
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ /config/*.yml
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --force-color
2
+ --order rand
data/.rubocop.yml ADDED
@@ -0,0 +1,35 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.4
3
+ Exclude:
4
+ - 'bin/**/*'
5
+ - '*.gemspec'
6
+ - 'Gemfile'
7
+ - 'Gemfile.lock'
8
+
9
+ Style/Copyright:
10
+ Enabled: false
11
+
12
+ Style/Documentation:
13
+ Enabled: false
14
+
15
+ Metrics/LineLength:
16
+ Max: 80
17
+ Exclude:
18
+ - '**/*_spec.rb'
19
+ - 'spec/factories/*.rb'
20
+
21
+ Layout/MultilineMethodCallIndentation:
22
+ EnforcedStyle: indented
23
+
24
+ Style/FrozenStringLiteralComment:
25
+ Enabled: false
26
+
27
+ Metrics/ModuleLength:
28
+ Exclude:
29
+ - '**/*_spec.rb'
30
+ - 'spec/factories/*.rb'
31
+
32
+ Metrics/BlockLength:
33
+ Exclude:
34
+ - '**/*_spec.rb'
35
+ - 'spec/factories/*.rb'
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - jruby-9.1.0.0
5
+ - jruby-9.2.0.0
6
+ - jruby-head
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in wikipedia-vandalism_detection.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,4 @@
1
+ Copyright (c) 2014-2018 Paul Götze
2
+
3
+ This software is licensed under the GPL v3.
4
+ For further information and the full license text see: http://www.gnu.org/licenses/gpl-3.0.en.html
data/README.md ADDED
@@ -0,0 +1,288 @@
1
+ # Wikipedia Vandalism Detection
2
+
3
+ Vandalism detection on the Wikipedia history with JRuby v9.1.0.0+.
4
+
5
+ The Wikipedia Vandalism Detection Gem uses the Weka Machine-Learning Library
6
+ via the [weka](https://github.com/paulgoetze/weka-jruby) gem.
7
+
8
+ [![Gem Version](https://badge.fury.io/rb/wikipedia-vandalism_detection.svg)](http://badge.fury.io/rb/wikipedia-vandalism_detection)
9
+ [![Build Status](https://travis-ci.org/paulgoetze/wikipedia-vandalism-detection.png?branch=develop)](https://travis-ci.org/paulgoetze/wikipedia-vandalism-detection)
10
+
11
+ ## What You can do with it
12
+
13
+ * parsing Wikipedia history pages to get edits and revisions
14
+ * creating training and test ARFF files from
15
+ the [WVC-PAN-10](https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-pan-wvc-10) and
16
+ the [WVC-PAN-11](https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-pan-wvc-11)
17
+ (See also http://pan.webis.de under category Wikipedia Vandalism Detection: [CLEF 2010](http://pan.webis.de/clef10/pan10-web/wikipedia-vandalism-detection) & [CLEF 2011](http://pan.webis.de/clef11/pan11-web/wikipedia-vandalism-detection))
18
+
19
+ * calculating vandalism features for a Wikipedia page (XML) from the history dump
20
+ * creating and evaluating a classifier with the created training ARFF file
21
+ * classifying new instances of Wikipedia edits as 'regular' or 'vandalism'
22
+
23
+ ## Installation
24
+
25
+ Add this line to your application's Gemfile:
26
+
27
+ gem 'wikipedia-vandalism_detection'
28
+
29
+ And then execute:
30
+
31
+ $ bundle
32
+
33
+ Or install it yourself as:
34
+
35
+ $ gem install wikipedia-vandalism_detection
36
+
37
+ ## Usage
38
+
39
+ require 'wikipedia/vandalism_detection'
40
+
41
+ ### Configuration
42
+
43
+ To configure the system put a `wikipedia-vandalism-detection.yml` file in the
44
+ `config/` or `lib/config/` directory.
45
+
46
+ You can configure:
47
+
48
+ A) the training and test corpora directories and essential input and output files
49
+
50
+ ```YAML
51
+ corpora:
52
+ base_directory: /home/user/corpora
53
+
54
+ training:
55
+ base_directory: training
56
+ annotations_file: annotations.csv
57
+ edits_file: edits.csv
58
+ revisions_directory: revisions
59
+
60
+ test:
61
+ base_directory: test
62
+ edits_file: edits.csv
63
+ revisions_directory: revisions
64
+
65
+ output:
66
+ base_directory: /home/user/output_path
67
+ training:
68
+ arff_file: training.arff
69
+ index_file: training_index.yml
70
+ test:
71
+ arff_file: test.arff
72
+ index_file: test_index.yml
73
+ ```
74
+
75
+ Evaluation outputs are saved under the output base directory path.
76
+
77
+ B) the features used by the feature calculator
78
+
79
+ ```YAML
80
+ features:
81
+ - anonymity
82
+ - biased frequency
83
+ - character sequence
84
+ - ...
85
+ ```
86
+
87
+ C) the classifier type and its options and the number of cross validation splits
88
+ for the classifier evaluation
89
+
90
+ ```YAML
91
+ classifier:
92
+ type: Trees::RandomForest # Weka classifier class
93
+ options: -I 10 -K 0.5 # same as for Weka, for further classifier options see Weka-dev documentation
94
+ cross-validation-fold: 5 # default is 10
95
+ training-data-options: balanced # default is unbalanced
96
+ ```
97
+
98
+ `training-data-options` is used to resample the training dataset:
99
+
100
+ * `unbalanced` is the default value and uses the original dataset
101
+ * `balanced` uses random undersampling of the majority class
102
+ * `oversampled` uses SMOTE oversampling (with percentage `-p`) and random undersampling (with minority/majority class balance `-u`)
103
+
104
+ Examples:
105
+
106
+ ```YAML
107
+ # 200% SMOTE oversampling with 300% random undersampling
108
+ training-data-options: oversampled -p 200 -u true 300
109
+
110
+ # default 100% SMOTE oversampling with 300% random undersampling
111
+ training-data-options: oversampled -u true 300
112
+
113
+ # 200% SMOTE oversampling with default full (100% minority/majority class balance)
114
+ # random undersampling
115
+ training-data-options: oversampled -p 200
116
+
117
+ # default 100% SMOTE oversampling without undersampling
118
+ training-data-options: oversampled -u false
119
+ ```
120
+
121
+ Instead of the `true` option you can also use `t`, `y` and `yes` as well as their upper case counterparts.
122
+
123
+ ### Examples
124
+
125
+ **Create training and test ARFF file from configured corpus:**
126
+
127
+ ```ruby
128
+ training_dataset = Wikipedia::VandalismDetection::TrainingDataset.build
129
+ test_dataset = Wikipedia::VandalismDetection::TestDataset.build
130
+ ```
131
+
132
+ While creating the training and test datasets, a corpus file index is written for each of them to the configured
133
+ `index_file` location.
134
+ To run the corpus file index creation manually use:
135
+
136
+ ```ruby
137
+ Wikipedia::VandalismDetection::TrainingDataset.create_file_index!
138
+ Wikipedia::VandalismDetection::TestDataset.create_file_index!
139
+ ```
140
+
141
+ **Parse a Wikipedia page content:**
142
+
143
+ At the moment no namespaces are supported while parsing a page.
144
+ So, the `<page>...</page>` tags should not be included in a namespaced xml tag!
145
+
146
+ ```ruby
147
+ xml = File.read('wikipedia_page.xml')
148
+ parser = Wikipedia::VandalismDetection::PageParser.new
149
+ page = parser.parse(xml)
150
+
151
+ # Work with revisions and edits from the page
152
+ page.revisions.each do |revision|
153
+ puts revision.id
154
+ puts revision.parent_id
155
+ end
156
+
157
+ page.edits.each do |edit|
158
+ puts edit.new_revision.id
159
+ puts edit.old_revision.id
160
+ end
161
+ ```
162
+
163
+ **Use a classifier of configured type:**
164
+
165
+ Create the classifier:
166
+
167
+ ```ruby
168
+ classifier = Wikipedia::VandalismDetection::Classifier.new
169
+ ```
170
+
171
+ Evaluation of the classifier against the configured training corpus:
172
+
173
+ ```ruby
174
+ # classifier.classifier_instance returns the weka classifier instance
175
+ evaluation = classifier.classifier_instance.cross_validate(folds: 10)
176
+ puts evaluation.class_details
177
+ ```
178
+
179
+ Classify a new edit:
180
+
181
+ ```ruby
182
+ # Classification of a Wikipedia Edit or a feature set
183
+ # 'edit' is a Wikipedia::VandalismDetection::Edit, this can be built manually or by
184
+ # parsing a Wikipedia page content and getting its edits
185
+ # The returned confidence is a value between 0.0 and 1.0 where 0.0 means 'regular' and 1.0 means 'vandalism'
186
+ confidence = classifier.classify(edit)
187
+
188
+ feature_calculator = Wikipedia::VandalismDetection::FeatureCalculator.new
189
+ features = feature_calculator.calculate_features_for(edit)
190
+ confidence = classifier.classify(features)
191
+ ```
192
+
193
+ Evaluate test corpus classification:
194
+
195
+ ```ruby
196
+ evaluator = classifier.evaluator
197
+ # or create a new evaluator
198
+ evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
199
+
200
+ performance_data = evaluator.evaluate_testcorpus_classification #default sample_count = 100
201
+ performance_data = evaluator.evaluate_testcorpus_classification(sample_count: 200)
202
+
203
+ # following attributes can be used for further computations
204
+ recall_values = performance_data[:recalls] # recall values for e.g. x-values of PRC or y-values of ROC
205
+ precision_values = performance_data[:precisions] # precision values for e.g. y-values of PRC
206
+ fp_rate_values = performance_data[:fp_rates] # false positive rate values for e.g. x-values of ROC
207
+ area_under_curve_pr = performance_data[:pr_auc] # computed from the precision and recall values
208
+ area_under_curve_ro = performance_data[:roc_auc] # computed from the recall and fp-rate values
209
+ total_recall = performance_data[:total_recall] # precision and recall values with maximum area (rectangle area)
210
+ total_precision = performance_data[:total_precision]
211
+ ```
212
+
213
+ Get each feature's predictive value for analysis:
214
+
215
+ ```ruby
216
+ evaluator = classifier.evaluator
217
+ # or create a new evaluator
218
+ evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
219
+
220
+ analysis_data = evaluator.feature_analysis #default sample_count = 100
221
+ analysis_data = evaluator.feature_analysis(sample_count: 1000)
222
+ ```
223
+
224
+ This returns a hash comprising all feature names as configured as keys and the threshold hashes as values.
225
+
226
+ ```ruby
227
+ {
228
+ feature_name_1:
229
+ {
230
+ 0.0 => {fp:… , fn:… , tp:… , tn:… },
231
+ …,
232
+ 1.0 => {fp:… , fn:… , tp:… , tn:… }
233
+ },
234
+ …,
235
+ feature_name_n:
236
+ {
237
+ 0.0 => {fp:… , fn:… , tp:… , tn:… },
238
+ …,
239
+ 1.0 => {fp:… , fn:… , tp:… , tn:… }
240
+ },
241
+ }
242
+ ```
243
+
244
+ **Creating new Features:**
245
+
246
+ You can define your own new Feature classes and use them by configuration in the config.yml.
247
+
248
+ Make sure to define the Feature class inside of the `Wikipedia::VandalismDetection::Features` module
249
+ and to implement the `calculate` method
250
+ (also refer to the `Wikipedia::VandalismDetection::Features::Base` class definition).
251
+
252
+ ```ruby
253
+ module Wikipedia
254
+ module VandalismDetection
255
+ module Features
256
+ class MyNewFeature < Base
257
+ def calculate(edit)
258
+ super # ensures raising an error if 'edit' is not an Edit.
259
+
260
+ # ...your implementation
261
+ end
262
+ end
263
+ end
264
+ end
265
+ end
266
+ ```
267
+
268
+ While creating new Feature classes you should be aware of the following naming convention:
269
+ The feature's name in the config.yml is the *downcased name with spaces or dashes* of the feature class name
270
+
271
+ E.g.:
272
+
273
+ ```YAML
274
+ features:
275
+ - my new feature
276
+ - my-new-feature
277
+ ```
278
+
279
+ both search for a Feature class with the name `MyNewFeature`.
280
+
281
+
282
+ ## Contributing
283
+
284
+ 1. Fork it ( http://github.com/paulgoetze/wikipedia-vandalism_detection/fork )
285
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
286
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
287
+ 4. Push to the branch (`git push origin my-new-feature`)
288
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
7
+
8
+ desc 'Start an irb session with the gem loaded'
9
+ task :irb do
10
+ sh 'irb -I ./lib -r wikipedia/vandalism_detection'
11
+ end
@@ -0,0 +1,103 @@
1
+ # Configuring the training and test corpora directories and essential input and output files.
2
+ # The WVC-PAN-10 and WVC-PAN-11 corpora can be used (see http://webis.de/ under Research -> Corpora).
3
+
4
+ corpora:
5
+ base_directory: /home/user/corpora
6
+
7
+ training:
8
+ base_directory: training
9
+ annotations_file: annotations.csv
10
+ edits_file: edits.csv
11
+ revisions_directory: revisions
12
+
13
+ test:
14
+ base_directory: test
15
+ edits_file: edits.csv
16
+ revisions_directory: revisions
17
+
18
+ output:
19
+ base_directory: /home/user/output_path
20
+ training:
21
+ arff_file: training.arff
22
+ index_file: training_index.yml
23
+ test:
24
+ arff_file: test.arff
25
+ index_file: test_index.yml
26
+
27
+
28
+ # Configuring the used features.
29
+ # See lib/wikipedia/vandalism_detection/features for the available feature classes.
30
+
31
+ features:
32
+ - anonymity
33
+ - anonymity previous
34
+ - all wordlists frequency
35
+ - all wordlists impact
36
+ - article size
37
+ - bad frequency
38
+ - bad impact
39
+ - biased frequency
40
+ - biased impact
41
+ - blanking
42
+ - character sequence
43
+ - character diversity
44
+ - comment length
45
+ - comment biased frequency
46
+ - comment pronoun frequency
47
+ - comment vulgarism frequency
48
+ - compressibility
49
+ - copyedit
50
+ - digit ratio
51
+ - edits per user
52
+ - emoticons frequency
53
+ - emoticons impact
54
+ - inserted size
55
+ - inserted words
56
+ - inserted character distribution
57
+ - inserted external links
58
+ - inserted internal links
59
+ - longest word
60
+ - markup frequency
61
+ - markup impact
62
+ - non-alphanumeric ratio
63
+ - personal life
64
+ - pronoun frequency
65
+ - pronoun impact
66
+ - removed size
67
+ - removed words
68
+ - removed all wordlists frequency
69
+ - removed bad frequency
70
+ - removed biased frequency
71
+ - removed character distribution
72
+ - removed emoticons frequency
73
+ - removed markup frequency
74
+ - removed pronoun frequency
75
+ - removed sex frequency
76
+ - removed vulgarism frequency
77
+ - replacement similarity
78
+ - reverted
79
+ - revisions character distribution
80
+ - sex frequency
81
+ - sex impact
82
+ - same editor
83
+ - size increment
84
+ - size ratio
85
+ - term frequency
86
+ - time interval
87
+ - time of day
88
+ - upper case ratio
89
+ - upper case words ratio
90
+ - upper to lower case ratio
91
+ - vulgarism frequency
92
+ - vulgarism impact
93
+ - weekday
94
+ - words increment
95
+
96
+
97
+ # Configuring the used classifier
98
+
99
+ classifier:
100
+ type: Trees::RandomForest # Weka classifier class
101
+ options: -I 10 -K 0.5 # same as for Weka, for further classifier options see the Weka-dev documentation
102
+ cross-validation-fold: 5 # default is 10
103
+ training-data-options: balanced # default is unbalanced
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,21 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
+ module Weka
5
+ module Classifiers
6
+ module Meta
7
+ require 'java/oneClassClassifier.jar'
8
+ include ClassBuilder
9
+
10
+ # One class classifier by C. Hempstalk (cite: http://dl.acm.org/citation.cfm?id=1431987)
11
+ # Jar can be downloaded at: http://sourceforge.net/projects/weka/files/weka-packages/oneClassClassifier1.0.4.zip
12
+ build_class :OneClassClassifier
13
+
14
# Reopens OneClassClassifier to add a class-level `type` reader.
class OneClassClassifier
  class << self
    # Returns the classifier type identifier string for this class.
    def type
      'Meta::OneClassClassifier'
    end
  end
end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,15 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
+ module Weka
5
+ module Classifiers
6
+ module Meta
7
+ require 'java/realAdaBoost.jar'
8
+ include ClassBuilder
9
+
10
+ # Real ada boost classifier, see: http://www.stanford.edu/~hastie/Papers/AdditiveLogisticRegression/alr.pdf
11
+ # Jar can be downloaded at: http://prdownloads.sourceforge.net/weka/realAdaBoost1.0.1.zip?download
12
+ build_class :RealAdaBoost
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,16 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
+ module Weka
5
+ module Classifiers
6
+ module Trees
7
+ require 'java/balancedRandomForest.jar'
8
+ include ClassBuilder
9
+
10
+ # balanced RandomForest classifier,
11
+ # Modified from https://github.com/jdurbin/durbinlib/blob/master/src/durbin/weka/BalancedRandomForest.java
12
+ # and https://github.com/jdurbin/durbinlib/blob/master/src/durbin/weka/BalancedRandomTree.java
13
+ build_class :BalancedRandomForest
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,15 @@
1
+ require 'weka'
2
+ require 'weka/class_builder'
3
+
4
+ module Weka
5
+ module Filters
6
+ module Supervised
7
+ module Instance
8
+ require 'java/SMOTE.jar'
9
+ include ClassBuilder
10
+
11
+ build_class :SMOTE
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,103 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Algorithms
6
class KullbackLeiblerDivergence
  # Largest deviation from 1.0 that is tolerated when verifying that the
  # probabilities of a distribution sum up to one.
  ALLOWED_ERROR = 9e-6

  # Returns the Symmetric Kullback-Leibler divergence with simple back-off
  # of the given texts' alphanumeric character distributions. For
  # implementation details see:
  # https://web.archive.org/web/20130508191111/http://staff.science.uva.nl/~tsagias/?p=185.
  #
  # text_a, text_b - the Strings to compare.
  #
  # Returns a Float, or Features::MISSING_VALUE if (after clean-up) either
  # text contains no alphanumeric character.
  # Raises Exception if a computed distribution does not sum up to 1.0.
  def of(text_a, text_b)
    text_a = cleanup_text(text_a)
    text_b = cleanup_text(text_b)

    unless text_a.match(/[[:alnum:]]/) && text_b.match(/[[:alnum:]]/)
      return Features::MISSING_VALUE
    end

    distribution_a = character_distribution(text_a)
    distribution_b = character_distribution(text_b)

    sum_a = distribution_a.values.inject(0, :+)
    sum_b = distribution_b.values.inject(0, :+)

    # Characters that occur in text_b only.
    character_diff = distribution_b.keys - distribution_a.keys

    # Back-off: epsilon stands in as text_b's probability for characters
    # of text_a that text_b lacks. It is a small fraction of the smallest
    # observed probability.
    epsilon = [
      distribution_a.values.min / sum_a,
      distribution_b.values.min / sum_b
    ].min * 0.001

    # gamma scales text_b's observed probabilities down by one epsilon per
    # character that occurs in text_b only.
    gamma = 1 - character_diff.size * epsilon

    check_integrity(distribution_a, sum_a)
    check_integrity(distribution_b, sum_b)

    distribution_a.inject(0.0) do |divergence, (character, probability)|
      prob_a = probability / sum_a
      share_b = distribution_b[character]
      prob_b = share_b ? gamma * (share_b / sum_b) : epsilon

      divergence + (prob_a - prob_b) * Math.log(prob_a / prob_b)
    end
  end

  private

  # Transcodes the text from binary to UTF-8, dropping every byte that is
  # invalid or has no UTF-8 mapping. Because the source encoding is
  # declared as binary, this removes all non-ASCII bytes, not only
  # malformed sequences.
  def cleanup_text(text)
    text.encode(
      'UTF-8',
      'binary',
      invalid: :replace,
      undef: :replace,
      replace: ''
    )
  end

  # Returns a Hash mapping each downcased alphanumeric character (as a
  # Symbol) to its relative frequency among all alphanumeric characters of
  # the text. Returns an empty Hash if there are none.
  def character_distribution(text)
    return {} if text.empty?

    characters = text.downcase.scan(/[[:alnum:]]/)
    total = characters.count.to_f

    counts = characters.each_with_object(Hash.new(0)) do |character, tally|
      tally[character.to_sym] += 1
    end

    # Build a plain Hash (no default value) so that looking up a character
    # that is absent yields nil; #of relies on that to apply the back-off.
    Hash[counts.map { |key, count| [key, count / total] }]
  end

  # Checks if values sum up to 1.0, raises an error if they don't.
  #
  # distribution - Hash of character => probability.
  # sum          - the sum the probabilities are normalized by.
  #
  # Returns nil if the normalized values sum up to 1.0 within
  # ALLOWED_ERROR; raises Exception otherwise.
  def check_integrity(distribution, sum)
    total = distribution.values
                        .inject(0) { |result, value| result + (value / sum) }

    # Take the absolute value of the *difference*. Calling .abs on the sum
    # alone would let totals above 1.0 produce a negative difference and
    # slip through the check.
    difference = (1.0 - total).abs

    return if difference <= ALLOWED_ERROR

    raise(Exception, 'Text distribution does not sum up to 1.0')
  end
end
101
+ end
102
+ end
103
+ end