wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,80 @@
1
+ # The WikitextExtractor imports the WikitextExtractor class from the
2
+ # sweble-wikitext-extractor.jar
3
+ # The sweble-wikitext-extractor.jar is a custom Java project which uses the
4
+ # Sweble wikitext parser to extract plaintext out of wikimarkup text.
5
+ #
6
+ # The Sweble WikitextExtractor currently depends on the swc-engine -v1.1.0 with
7
+ # dependencies,
8
+ # see: http://sweble.org/downloads/swc-devel/master-latest/ to download it.
9
+ #
10
+ # The Java source code can be found on:
11
+ # webis.uni-weimar.de:/srv/cvsroot/code-in-progress/wikipedia-vandalism-detection/sweble-wikitext-extractor
12
+ module Wikipedia
13
+ module VandalismDetection
14
+ require 'java'
15
+ require 'java/swc-engine-1.1.0-jar-with-dependencies.jar'
16
+ require 'java/sweble-wikitext-extractor.jar'
17
+
18
+ java_import 'de.webis.sweble.WikitextExtractor'
19
+
20
+ class WikitextExtractionError < StandardError; end
21
+
22
+ # This class wraps the de.webis.sweble.WikitextExtractor Java class and
23
+ # provides methods to extract plaintext from wiki markup text both space
24
+ # preserving and cleaned without line breaks and whitespace.
25
+ class WikitextExtractor
26
+ REDIRECT = '#REDIRECT'.freeze
27
+
28
+ class << self
29
# Extracts the text of the given wiki markup, preserving spacing and
# including the added section numbers.
#
# Before extraction the input is re-encoded to UTF-8 (bytes that cannot be
# converted are dropped) and every '#REDIRECT' marker is removed.
# NOTE(review): the source encoding is declared as 'binary', which appears
# to drop all non-ASCII bytes as well - confirm this is intended.
#
# Raises WikitextExtractionError when a StandardError occurs on the way.
def extract(wiki_text)
  sanitized = wiki_text
              .encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
              .gsub(REDIRECT, '')

  WikitextExtractor.new.extract(sanitized)
rescue => error
  raise WikitextExtractionError, "Wikitext extraction failed: \n#{error.message}", caller
end
40
+
41
# Returns the extracted text of the given wiki markup as one cleaned string.
# The clean-up steps run in the listed order on the result of #extract, and
# the final text is stripped of leading and trailing whitespace.
def extract_clean(wiki_text)
  cleanup_steps = %i[
    remove_section_numbering_from
    remove_line_breaks_from
    remove_uris_from
    remove_special_signes_from
    remove_multiple_spaces_from
  ]

  # send is used because the clean-up helpers are private.
  cleaned = cleanup_steps.reduce(extract(wiki_text)) do |text, step|
    send(step, text)
  end

  cleaned.strip
end
54
+
55
+ private
56
+
57
# Removes section numbering such as "1.", "1.1.", "2.3.4." or "10.2." from
# the beginning of every line. Each numbering level may consist of several
# digits; numbers that do not start a line are left untouched.
def remove_section_numbering_from(text)
  text.gsub(/^(\d+\.)+/, '')
end
61
+
62
# Replaces every run of newline characters with a single space.
def remove_line_breaks_from(text)
  newline_run = /\n+/
  text.gsub(newline_run, ' ')
end
65
+
66
# Replaces each run of whitespace characters (as matched by \s) with a
# single space.
def remove_multiple_spaces_from(text)
  whitespace_run = /\s+/
  text.gsub(whitespace_run) { ' ' }
end
69
+
70
# Deletes http, https and ftp URIs from the text. The scheme is matched
# case-insensitively and an optional whitespace around the colon is allowed.
def remove_uris_from(text)
  uri_pattern = %r{(https?|ftp)\s?:\s?\/\/[^\s\/$.?#].[^\s]*}i
  text.gsub(uri_pattern, '')
end
73
+
74
# Replaces each wiki markup sign ([ ] { } | =) with a single space.
#
# A character class is required here: without it the pattern only matches
# the literal six-character sequence "[]{}|=" and leaves the signs in place.
def remove_special_signes_from(text)
  text.gsub(/[\[\]{}|=]/, ' ')
end
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,11 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ BAD = %i[
5
+ 666 da dont dosent whatever guy hi nazi sup guise loser thats ugly wanna
6
+ whats wont gotta bloody fart pot prick stink smells smelly alot dunno
7
+ gotcha
8
+ ].freeze
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,20 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ BIASED = %i[
5
+ acclaimed amazing astonishing authoritative beautiful best brilliant
6
+ canonical celebrated charismatic classic cutting-edge defining
7
+ definitive eminent enigma exciting extraordinary fabulous famous
8
+ infamous fantastic fully genius global great greatest iconic immensely
9
+ impactful incendiary indisputable influential innovative inspired
10
+ intriguing leader leading legendary major masterly mature memorable
11
+ notable outstanding pioneer popular prestigious really remarkable
12
+ renowned respected seminal significant skillful solution single-handedly
13
+ staunch talented most top transcendent undoubtedly unique visionary
14
+ virtually virtuoso well-known well-established world-class worst coolest
15
+ super probably hate ugly fat lame weird strange everyone cares boring
16
+ boreing ever huge like idiotic absolute total totally
17
+ ].freeze
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,26 @@
1
module Wikipedia
  module VandalismDetection
    module WordLists
      # Emoticons as symbols. Special characters are backslash-escaped inside
      # the symbol text.
      # NOTE(review): presumably the entries are interpolated into a regular
      # expression by the emoticon features - confirm against the callers.
      EMOTICONS = [
        :':\)', :':p', :':\(', :';\)', :':D', :';D', :';P', :';p', :':\-\)',
        :':\-\(', :';\-\)', :':\-D', :':\-p', :':\-P', :'8\-\)', :'8\)',
        :'\^\^', :'\*_\*', :'\^_\^', :':\-I', :':\-X', :':\-x', :'X\-p',
        :'X\-P', :':\-\]', :'\^\.\^', :':\*', :':\-\*', :XD, :'X\-D', :'8\-D',
        :'8D', :':\-O', :':\-o', :':\-\|', :'X\-\(', :'X\(', :'\-_\-', :':o\)',
        :':O\)', :'B\-\)', :':O', :':o', :':\-s', :':\-S', :':\-\/', :':\-\\',
        :T_T, :':\*\(', :':\*\-\(', :':\(\(', :'\*\-\*', :':\-\[', :':\->',
        :':\|', :':\-\|', :':\]', :':\[', :'\/:\(', :'\\:\(', :':\-$', :':$',
        :':\-6', :':\-9', :'@_@', :'<3', :'\|\-D', :':0', :':\-0', :o_O,
        :oO, :'\(\-:', :'\(\-;', :'\(:', :'\):', :'\)\-:', :'\(;', :'\(y\)',
        :'\(\.\)\(\.\)', :O_O, :'0_0', :'8\-\[', :'8\-\]', :'8\[', :'8\]',
        :'8\-\(', :'8\(', :':\-', :'%\)', :'%\-\)', :'8\|', :'8\-\|', :'=\)',
        :':\]', :':>', :':c\)', :'\[:', :'<:', :'c:', :'\(x', :'\(o:', :'\(c:',
        :'D:', :':\'\(', :':\'C', :';\(', :';o\)', :'\(o;', :':b', :':p', :'=P',
        :':P', :dx, :xP, :'d\-:', :'d:', :'q:', :'d=', :'d;', :'c\(:', :'=D',
        :'=\-D', :'=O', :'=o', :'=0', :'o=', :'O=', :'0=', :'\^_~', :'>_<',
        # '=[' and '=]' are two separate emoticons; a misplaced quote used to
        # fuse them into one symbol.
        :'~_~', :'>:', :':<', :'\(Y\)', :'\(=', :'\)=', :'=\(', :'=\[', :'=\]',
        :'\[=', :'\]='
      ].freeze
    end
  end
end
@@ -0,0 +1,19 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ MARKUP = [
5
+ :'\{\{',
6
+ :'\[\[',
7
+ :infobox,
8
+ :category,
9
+ :defaultsort,
10
+ :'<ref>',
11
+ :cite,
12
+ :__toc__,
13
+ :__forcetoc__,
14
+ :defaultsort,
15
+ :reflist
16
+ ].freeze
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,11 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ PRONOUNS = %i[
5
+ i me myself mine my we us ourselves ourself ours our you yourself yours
6
+ your thou thee thyself thine thy yourselves y'all youse you-uns y'all
7
+ youse yous yis yourselves y'all's selves yous's
8
+ ].freeze
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,10 @@
1
module Wikipedia
  module VandalismDetection
    module WordLists
      # Sex related terms as symbols.
      # The list used to contain `vibrator` twice; following the
      # singular/plural pattern of the neighbouring entries, the second
      # occurrence is now `vibrators`.
      SEX = %i[
        vagina sex anal penis breast breasts buttocks sodomy sodomized erect
        nipple nipples vibrator vibrators dildo dildos
      ].freeze
    end
  end
end
@@ -0,0 +1,96 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ # This list is taken from https://github.com/snipe/banbuilder and can be
5
+ # downloaded from:
6
+ # https://github.com/snipe/banbuilder/blob/master/word-dbs/wordlist.csv
7
+ VULGARISM = %i[
8
+ $#!+ $1ut $h1t $hit $lut 'ho 'hobag a$$ anus ass assmunch b1tch
9
+ ballsack bastard beaner beastiality biatch beeyotch bitchy
10
+ blow blowjob bollock bollocks bollok boner boob bugger buttplug
11
+ c-0-c-k c-o-c-k c-u-n-t c.0.c.k c.o.c.k. c.u.n. jerk jackoff
12
+ jackhole j3rk0ff homo hom0 hobag hell h0mo h0m0 goddamn goddammit
13
+ godamnit ghey ghay gfy gay fudgepacker fuckwad fucktard fuckoff
14
+ fucker fuck-tard fuck fellatio fellate felching felcher felch
15
+ fartknocker fart fannybandit fanny faggot fagg fag f.u.c.k f-u-c-k
16
+ dyke douchebag douche douch3 doosh dike dick damnit damn dammit d1ldo
17
+ d1ld0 d1ck d0uche d0uch3 cunt cumstain cum crap coon cock clitoris
18
+ clit cl1t cawk c0ck jerk0ff jerkoff jizz knobend labia lmfao moolie
19
+ muff nigga nigger p.u.s.s.y. piss piss-off pissoff prick pube pussy
20
+ queer retard retarded s-h-1-t s-h-i-t s.h.i.t. scrotum sh1t shit slut
21
+ smegma t1t tard terd tit tits titties turd twat vag wank wetback
22
+ whore whoreface 'f*ck' sh*t pu$$y p*ssy diligaf wtf stfu fu*ck fack
23
+ shite fxck sh!t @sshole assh0le assho!e a$$hole a$$h0le a$$h0!e
24
+ a$$h01e assho1e wh0re f@g f@gg0t f@ggot motherf*cker mofo cuntlicker
25
+ cuntface dickbag cockknocker beatch fucknut nucking futs mams cunny
26
+ quim clitty kike spic wop chink humper feltch feltcher fvck ahole
27
+ nads spick douchey bullturds gonads bitch butt fellatio lmao s-o-b
28
+ spunk he11 jizm jism bukkake shiz wigger gook ritard reetard
29
+ masterbate masturbate goatse masterbating masturbating hitler nazi
30
+ tubgirl gtfo foad r-tard rtard hoor g-spot gspot vulva assmaster
31
+ viagra phuck frack fuckwit assbang assbanged assbangs asshole
32
+ assholes asswipe asswipes b1tch bastards bitched bitches boners
33
+ bullshit bullshits bullshitted cameltoe chinc chincs chink chode
34
+ chodes clit clits cocks coons cumming cunts d1ck dickhead dickheads
35
+ doggie-style douchebags dumass dumbass dumbasses dykes faggit fags
36
+ fucked fucker fuckface fucks godamnit gooks humped humping jackass
37
+ jap japs jerk jizzed kikes knobend kooch kooches kootch fuckers
38
+ motherfucking niggah niggas niggers p.u.s.s.y. pussies queers rim s0b
39
+ shitface shithead shits shitted s.o.b. spik spiks twats whack whores
40
+ zoophile m-fucking mthrfucking muthrfucking mutherfucking
41
+ mutherfucker mtherfucker mthrfucker mthrf*cker whorehopper copulator
42
+ whoralicious whorealicious aeolus analprobe areola areole aryan arian
43
+ asses assfuck azazel baal babes bang banger barf bawdy beardedclam
44
+ beater beaver beer bigtits bimbo blew blow blowjobs blowup bod bodily
45
+ boink bone boned bong boobies boobs booby booger bookie booky bootee
46
+ bootie booty booze boozer boozy bosom bosomy bowel bowels bra
47
+ brassiere bung babe bush buttfuck cocaine kinky klan panties
48
+ pedophile pedophilia pedophiliac punkass queaf rape scantily essohbee
49
+ shithouse smut snatch toots doggie anorexia bulimia bulimiic burp
50
+ busty buttfucker caca cahone carnal carpetmuncher cervix climax
51
+ cocain cocksucker coital coke commie condom corpse coven crabs crack
52
+ crackwhore crappy cuervo cummin cumshot cumshots cunnilingus dago
53
+ dagos damned dick-ish dickish dickweed anorexic prostitute marijuana
54
+ lsd pcp diddle dawgie-style dimwit dingle doofus dopey douche drunk
55
+ dummy ejaculate enlargement erect erotic exotic extacy extasy faerie
56
+ faery fagged fagot fairy fisted fisting fisty floozy fondle foobar
57
+ foreskin frigg frigga fubar fucking fuckup ganja gays glans godamn
58
+ goddam goldenshower gonad gonads handjob hebe hemp heroin herpes
59
+ hijack hiv homey honky hooch hookah hooker hootch hooter hooters hump
60
+ hussy hymen inbred incest injun jerked jiz jizm horny junkie junky
61
+ kill kkk kraut kyke lech leper lesbians lesbos lez lezbian lezbians
62
+ lezbo lezbos lezzie lezzies lezzy loin loins lube lust lusty massa
63
+ masterbation masturbation maxi menses menstruate menstruation meth
64
+ molest moron motherfucka motherfucker murder muthafucker nad naked
65
+ napalm nappy nazism negro niggle nimrod ninny nooky nympho opiate
66
+ opium oral orally organ orgasm orgies orgy ovary ovum ovums paddy
67
+ pantie panty pastie pasty pecker pedo pee peepee penetrate
68
+ penetration penial penile perversion peyote phalli phallic
69
+ pillowbiter pimp pinko pissed pms polack porn porno pornography pot
70
+ potty prig prude pubic pubis punky puss queef queefing quife quicky
71
+ racist racy raped raper rapist raunch rectal rectum rectus reefer
72
+ reich revue risque rum rump sadism sadist satan scag schizo screw
73
+ screwed scrog scrot scrote scrud scum seaman seamen seduce semen
74
+ sex_story sexual shithole shitter shitty s*o*b sissy skag slave
75
+ sleaze sleazy sluts smutty sniper snuff sodom souse soused sperm
76
+ spooge stab steamy stiffy stoned strip stroke whacking suck sucked
77
+ sucking tampon tawdry teat teste testee testes testis thrust thug
78
+ tinkle titfuck titi titty whacked toke tramp trashy tush undies unwed
79
+ urinal urine uterus uzi valium virgin vixen vodka vomit voyeur vulgar
80
+ wad wazoo wedgie weed weenie weewee weiner weirdo wench whitey whiz
81
+ whored whorehouse whoring womb woody x-rated xxx b@lls yeasty yobbo
82
+ sumofabiatch doggy-style doggy wang dong d0ng w@ng wh0reface
83
+ wh0ref@ce wh0r3f@ce tittyfuck tittyfucker tittiefucker cockholster
84
+ cockblock gai gey faig faigt a55 a55hole gae corksucker rumprammer
85
+ slutdumper niggaz muthafuckaz gigolo pussypounder herp herpy
86
+ transsexual orgasmic cunilingus anilingus dickdipper dickwhipper
87
+ dicksipper dickripper dickflipper dickzipper homoey queero freex
88
+ cunthunter shamedame slutkiss shiteater fuckass fucka$$ clitorus
89
+ assfucker assfuckers dillweed cracker teabagging shitt azz fuk
90
+ fucknugget cuntlick g@y @ss beotch pussys 's***' paedophile
91
+ pedophiles pedophile sucks licker lickers bitchface idiot tosser
92
+ idiots tossers
93
+ ].freeze
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/word_lists/bad'
2
+ require 'wikipedia/vandalism_detection/word_lists/biased'
3
+ require 'wikipedia/vandalism_detection/word_lists/pronouns'
4
+ require 'wikipedia/vandalism_detection/word_lists/sex'
5
+ require 'wikipedia/vandalism_detection/word_lists/vulgarism'
6
+ require 'wikipedia/vandalism_detection/word_lists/markup'
7
+
8
module Wikipedia
  module VandalismDetection
    module WordLists
      # Returns a new array holding the words of the BAD, BIASED, PRONOUNS,
      # SEX and VULGARISM lists without duplicates.
      #
      # The non-destructive Array#uniq is used on purpose: Array#uniq!
      # returns nil when the array contains no duplicates, which would make
      # this method return nil instead of the word list.
      def self.all
        [*BAD, *BIASED, *PRONOUNS, *SEX, *VULGARISM].uniq
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'wikipedia'
2
+ require 'wikipedia/vandalism_detection/version'
3
+ require 'wikipedia/vandalism_detection/configuration'
4
+ require 'wikipedia/vandalism_detection/exceptions'
5
+
6
+ require 'wikipedia/vandalism_detection/text'
7
+ require 'wikipedia/vandalism_detection/revision'
8
+ require 'wikipedia/vandalism_detection/edit'
9
+ require 'wikipedia/vandalism_detection/page'
10
+ require 'wikipedia/vandalism_detection/page_parser'
11
+ require 'wikipedia/vandalism_detection/revision_parser'
12
+
13
+ require 'wikipedia/vandalism_detection/word_lists'
14
+ require 'wikipedia/vandalism_detection/diff'
15
+ require 'wikipedia/vandalism_detection/wikitext_extractor'
16
+ require 'wikipedia/vandalism_detection/features'
17
+ require 'wikipedia/vandalism_detection/feature_calculator'
18
+
19
+ require 'wikipedia/vandalism_detection/instances'
20
+ require 'wikipedia/vandalism_detection/training_dataset'
21
+ require 'wikipedia/vandalism_detection/test_dataset'
22
+ require 'wikipedia/vandalism_detection/classifier'
23
+ require 'wikipedia/vandalism_detection/evaluator'
24
+
25
+ require 'weka/classifiers/meta/one_class_classifier'
26
+ require 'weka/classifiers/meta/real_ada_boost'
27
+ require 'weka/classifiers/trees/balanced_random_forest'
28
+
29
+ require 'weka/filters/supervised/instance/smote'
data/lib/wikipedia.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'timeout'
4
+
5
+ module Wikipedia
6
+ def self.api_base_uri
7
+ 'https://en.wikipedia.org/w/api.php?format=xml&action=query&'
8
+ end
9
+
10
+ def self.param_string(params)
11
+ params.map { |k, v| "#{k}=#{v}" }.join('&') # values are not escaped here; api_request escapes the whole URI
12
+ end
13
+
14
+ # Reads uri, retrying up to `times` times on any StandardError (timeouts included); warns and returns '' when all attempts fail
15
+ def self.request_with_retry(uri, times = 1, timeout = 5)
16
+ content = ''
17
+
18
+ begin
19
+ Timeout.timeout(timeout) do # limit in seconds, applied to each attempt separately
20
+ content = URI.parse(uri).read
21
+ end
22
+ rescue => error
23
+ if times > 0
24
+ times -= 1
25
+ retry
26
+ else
27
+ warn "Requesting '#{uri}' failed multiple times.\n#{error.message}"
28
+ end
29
+ end
30
+
31
+ content
32
+ end
33
+
34
+ def api_request(params = {}) # returns the response parsed as a Nokogiri::XML document
35
+ uri = URI::DEFAULT_PARSER.escape(api_base_uri + param_string(params)) # same escaping as URI.encode, which was removed in Ruby 3.0
36
+ content = request_with_retry(uri, 3)
37
+ Nokogiri::XML(content)
38
+ end
39
+
40
+ module_function :api_request
41
+ end
@@ -0,0 +1,19 @@
1
+ FactoryBot.define do
2
+ factory :edit, class: Wikipedia::VandalismDetection::Edit do # Edit from :old_revision to :new_revision, without page info
3
+ old_revision { FactoryBot.build(:old_revision) }
4
+ new_revision { FactoryBot.build(:new_revision) }
5
+ page_id { nil }
6
+ page_title { nil }
7
+
8
+ initialize_with { new(old_revision, new_revision, page_id: page_id, page_title: page_title) }
9
+ end
10
+
11
+ factory :anonymous_edit, class: Wikipedia::VandalismDetection::Edit do # like :edit, but the new revision is built from :anonymous_revision
12
+ old_revision { FactoryBot.build(:old_revision) }
13
+ new_revision { FactoryBot.build(:anonymous_revision) }
14
+ page_id { nil }
15
+ page_title { nil }
16
+
17
+ initialize_with { new(old_revision, new_revision, page_id: page_id, page_title: page_title) }
18
+ end
19
+ end
@@ -0,0 +1,12 @@
1
+ FactoryBot.define do
2
+ factory :page, class: Wikipedia::VandalismDetection::Page do # Page with nil id and title
3
+ id { nil }
4
+ title { nil }
5
+
6
+ after :build do |obj| # adds three revisions to the built page, each with contributor 'User'
7
+ obj.add_revision FactoryBot.build(:old_revision, contributor: 'User')
8
+ obj.add_revision FactoryBot.build(:new_revision, contributor: 'User')
9
+ obj.add_revision FactoryBot.build(:even_newer_revision, contributor: 'User')
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,51 @@
1
+ FactoryBot.define do
2
+ factory :empty_revision, class: Wikipedia::VandalismDetection::Revision do |f| # nil id, parent and timestamp; text and comment are Text.new without arguments
3
+ f.id { nil }
4
+ f.parent_id { nil }
5
+ f.timestamp { nil }
6
+ f.text { Wikipedia::VandalismDetection::Text.new }
7
+ f.comment { Wikipedia::VandalismDetection::Text.new }
8
+ end
9
+
10
+ factory :old_revision, class: Wikipedia::VandalismDetection::Revision do |f| # id '1', no parent, no timestamp
11
+ f.id { '1' }
12
+ f.parent_id { nil }
13
+ f.timestamp { nil }
14
+ f.text { Wikipedia::VandalismDetection::Text.new('text 1') }
15
+ f.comment { Wikipedia::VandalismDetection::Text.new }
16
+ end
17
+
18
+ factory :new_revision, class: Wikipedia::VandalismDetection::Revision do |f| # id '2', child of revision '1'
19
+ f.id { '2' }
20
+ f.parent_id { '1' }
21
+ f.timestamp { '2014-11-27T18:00:00Z' }
22
+ f.text { Wikipedia::VandalismDetection::Text.new('text 2') }
23
+ f.comment { Wikipedia::VandalismDetection::Text.new }
24
+ end
25
+
26
+ factory :even_newer_revision, class: Wikipedia::VandalismDetection::Revision do |f| # id '3', child of revision '2', timestamped one day after :new_revision
27
+ f.id { '3' }
28
+ f.parent_id { '2' }
29
+ f.timestamp { '2014-11-28T18:00:00Z' }
30
+ f.text { Wikipedia::VandalismDetection::Text.new('text 3') }
31
+ f.comment { Wikipedia::VandalismDetection::Text.new }
32
+ end
33
+
34
+ factory :anonymous_revision, class: Wikipedia::VandalismDetection::Revision do |f| # same values as :new_revision plus an IP-address contributor
35
+ f.id { '2' }
36
+ f.parent_id { '1' }
37
+ f.timestamp { '2014-11-27T18:00:00Z' }
38
+ f.text { Wikipedia::VandalismDetection::Text.new('text 2') }
39
+ f.comment { Wikipedia::VandalismDetection::Text.new }
40
+ f.contributor { '127.0.0.1' }
41
+ end
42
+
43
+ factory :registered_revision, class: Wikipedia::VandalismDetection::Revision do |f| # same values as :new_revision plus contributor '12345' (presumably a registered user id; confirm against Revision)
44
+ f.id { '2' }
45
+ f.parent_id { '1' }
46
+ f.timestamp { '2014-11-27T18:00:00Z' }
47
+ f.text { Wikipedia::VandalismDetection::Text.new('text 2') }
48
+ f.comment { Wikipedia::VandalismDetection::Text.new }
49
+ f.contributor { '12345' }
50
+ end
51
+ end
@@ -0,0 +1,35 @@
1
+ corpora:
2
+ base_directory: ../../../../spec/resources/corpora
3
+
4
+ training:
5
+ base_directory: training
6
+ annotations_file: annotations.csv
7
+ edits_file: edits.csv
8
+ revisions_directory: revisions
9
+
10
+ test:
11
+ base_directory: test
12
+ edits_file: edits.csv
13
+ revisions_directory: revisions
14
+ ground_truth_file: ground-truth.txt
15
+
16
+ output:
17
+ base_directory: ../../../../spec/resources/build
18
+ training:
19
+ index_file: training_index.yml
20
+ arff_file: training.arff
21
+ test:
22
+ index_file: test_index.yml
23
+ arff_file: test.arff
24
+ classification_file: classification.txt
25
+
26
+ features:
27
+ - anonymity
28
+ - character sequence
29
+ - comment length
30
+
31
+ classifier:
32
+ type: Trees::RandomForest
33
+ options: -I 10
34
+ cross-validation-fold: 2
35
+ training-data-options: unbalanced
@@ -0,0 +1,8 @@
1
+ "editid","editor","oldrevisionid","newrevisionid","diffurl","edittime","editcomment","articleid","articletitle"
2
+ 1641,"137.163.16.199",328774088,328774188,"http://en.wikipedia.org/w/index.php?diff=328774188&oldid=328774088","2009-11-30T10:23:13Z","/* Location */",100935,"Not annotated article"
3
+ 1642,"J04n",307084144,326873205,"http://en.wikipedia.org/w/index.php?diff=326873205&oldid=307084144","2009-11-20T04:42:24Z","Repairing links to disambiguation pages - [[Wikipedia:Disambiguation pages with links|You can help!]]",19490449,"The Soundstage Sessions"
4
+ 1643,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
5
+ 1644,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
6
+ 1647,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
7
+ 1648,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
8
+ 1649,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
@@ -0,0 +1,3 @@
1
+ 307084144 326873205 R
2
+ 326471754 326978767 dunno
3
+ 328774035 328774110 V