wikipedia-vandalism_detection 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,80 @@
1
+ # The WikitextExtractor imports the WikitextExtractor class from the
2
+ # sweble-wikitext-extractor.jar
3
+ # The sweble-wikitext-extractor.jar is a custom Java project which uses the
4
+ # Sweble wikitext parser to extract plaintext out of wikimarkup text.
5
+ #
6
+ # The Sweble WikitextExtractor currently depends on the swc-engine -v1.1.0 with
7
+ # dependencies,
8
+ # see: http://sweble.org/downloads/swc-devel/master-latest/ to download it.
9
+ #
10
+ # The Java source code can be found on:
11
+ # webis.uni-weimar.de:/srv/cvsroot/code-in-progress/wikipedia-vandalism-detection/sweble-wikitext-extractor
12
+ module Wikipedia
13
+ module VandalismDetection
14
+ require 'java'
15
+ require 'java/swc-engine-1.1.0-jar-with-dependencies.jar'
16
+ require 'java/sweble-wikitext-extractor.jar'
17
+
18
+ java_import 'de.webis.sweble.WikitextExtractor'
19
+
20
+ class WikitextExtractionError < StandardError; end
21
+
22
+ # This class wraps the de.webis.sweble.WikitextExtractor Java class and
23
+ # provides methods to extract plaintext from wiki markup text both space
24
+ # preserving and cleaned without line breaks and whitespace.
25
+ class WikitextExtractor
26
+ REDIRECT = '#REDIRECT'.freeze
27
+
28
+ class << self
29
+ # Returns the extracted text from the given wiki markup preserving
30
+ # spacing with added section numbers.
31
+ def extract(wiki_text)
32
+ wiki_text = wiki_text.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
33
+ wiki_text = wiki_text.gsub(REDIRECT, '')
34
+
35
+ WikitextExtractor.new.extract(wiki_text)
36
+ rescue => exception
37
+ message = "Wikitext extraction failed: \n#{exception.message}"
38
+ raise WikitextExtractionError, message, caller
39
+ end
40
+
41
+ # Returns the cleaned extracted text from the given wiki markup.
42
+ # Cleaned means a single string without breaks, multiple spaces and
43
+ # section numbers.
44
+ def extract_clean(wiki_text)
45
+ wiki_text = extract wiki_text
46
+
47
+ wiki_text = remove_section_numbering_from wiki_text
48
+ wiki_text = remove_line_breaks_from wiki_text
49
+ wiki_text = remove_uris_from wiki_text
50
+ wiki_text = remove_special_signes_from wiki_text
51
+ wiki_text = remove_multiple_spaces_from wiki_text
52
+ wiki_text.strip
53
+ end
54
+
55
+ private
56
+
57
# Removes section numbering such as "1.", "1.1.", "2.3.4." or "10.2." from
# the beginning of every line.
#
# Each numbering component may have several digits (\d+), so sections
# numbered 10 and above are stripped as well.
#
# @param text [String] text whose lines may start with section numbers
# @return [String] a copy of text without the leading section numbers
def remove_section_numbering_from(text)
  text.gsub(/^(\d+\.)+/, '')
end
61
+
62
+ def remove_line_breaks_from(text)
63
+ text.gsub(/\n+/, ' ')
64
+ end
65
+
66
+ def remove_multiple_spaces_from(text)
67
+ text.gsub(/\s+/, ' ')
68
+ end
69
+
70
+ def remove_uris_from(text)
71
+ text.gsub(%r{(https?|ftp)\s?:\s?\/\/[^\s\/$.?#].[^\s]*}i, '')
72
+ end
73
+
74
# Replaces each left-over markup character ([, ], {, }, | and =) with a
# single space.
#
# A character class is required here: without it the pattern only matches
# the literal six-character sequence "[]{}|=".
#
# @param text [String] text that may still contain markup characters
# @return [String] a copy of text with every such character replaced by ' '
def remove_special_signes_from(text)
  text.gsub(/[\[\]{}|=]/, ' ')
end
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,11 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ BAD = %i[
5
+ 666 da dont dosent whatever guy hi nazi sup guise loser thats ugly wanna
6
+ whats wont gotta bloody fart pot prick stink smells smelly alot dunno
7
+ gotcha
8
+ ].freeze
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,20 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ BIASED = %i[
5
+ acclaimed amazing astonishing authoritative beautiful best brilliant
6
+ canonical celebrated charismatic classic cutting-edge defining
7
+ definitive eminent enigma exciting extraordinary fabulous famous
8
+ infamous fantastic fully genius global great greatest iconic immensely
9
+ impactful incendiary indisputable influential innovative inspired
10
+ intriguing leader leading legendary major masterly mature memorable
11
+ notable outstanding pioneer popular prestigious really remarkable
12
+ renowned respected seminal significant skillful solution single-handedly
13
+ staunch talented most top transcendent undoubtedly unique visionary
14
+ virtually virtuoso well-known well-established world-class worst coolest
15
+ super probably hate ugly fat lame weird strange everyone cares boring
16
+ boreing ever huge like idiotic absolute total totally
17
+ ].freeze
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,26 @@
1
module Wikipedia
  module VandalismDetection
    module WordLists
      # Frozen list of emoticons, stored as symbols.
      #
      # The text of each symbol has regex metacharacters backslash-escaped,
      # presumably so consumers can build a regular expression from the
      # entries (confirm against the emoticon feature classes).
      #
      # NOTE(review): a few entries occur more than once (e.g. :':p', :':\]')
      # and the "$" in :':\-$' / :':$' is not escaped; both are kept as they
      # were because the consuming code is not visible here.
      EMOTICONS = [
        :':\)', :':p', :':\(', :';\)', :':D', :';D', :';P', :';p', :':\-\)',
        :':\-\(', :';\-\)', :':\-D', :':\-p', :':\-P', :'8\-\)', :'8\)',
        :'\^\^', :'\*_\*', :'\^_\^', :':\-I', :':\-X', :':\-x', :'X\-p',
        :'X\-P', :':\-\]', :'\^\.\^', :':\*', :':\-\*', :XD, :'X\-D', :'8\-D',
        :'8D', :':\-O', :':\-o', :':\-\|', :'X\-\(', :'X\(', :'\-_\-', :':o\)',
        :':O\)', :'B\-\)', :':O', :':o', :':\-s', :':\-S', :':\-\/', :':\-\\',
        :T_T, :':\*\(', :':\*\-\(', :':\(\(', :'\*\-\*', :':\-\[', :':\->',
        :':\|', :':\-\|', :':\]', :':\[', :'\/:\(', :'\\:\(', :':\-$', :':$',
        :':\-6', :':\-9', :'@_@', :'<3', :'\|\-D', :':0', :':\-0', :o_O,
        :oO, :'\(\-:', :'\(\-;', :'\(:', :'\):', :'\)\-:', :'\(;', :'\(y\)',
        :'\(\.\)\(\.\)', :O_O, :'0_0', :'8\-\[', :'8\-\]', :'8\[', :'8\]',
        :'8\-\(', :'8\(', :':\-', :'%\)', :'%\-\)', :'8\|', :'8\-\|', :'=\)',
        :':\]', :':>', :':c\)', :'\[:', :'<:', :'c:', :'\(x', :'\(o:', :'\(c:',
        :'D:', :':\'\(', :':\'C', :';\(', :';o\)', :'\(o;', :':b', :':p', :'=P',
        :':P', :dx, :xP, :'d\-:', :'d:', :'q:', :'d=', :'d;', :'c\(:', :'=D',
        :'=\-D', :'=O', :'=o', :'=0', :'o=', :'O=', :'0=', :'\^_~', :'>_<',
        # "=[" and "=]" are two separate entries; a misplaced quote used to
        # fuse them into the single symbol :'=\[, :=\]'.
        :'~_~', :'>:', :':<', :'\(Y\)', :'\(=', :'\)=', :'=\(', :'=\[', :'=\]',
        :'\[=', :'\]='
      ].freeze
    end
  end
end
@@ -0,0 +1,19 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ MARKUP = [
5
+ :'\{\{',
6
+ :'\[\[',
7
+ :infobox,
8
+ :category,
9
+ :defaultsort,
10
+ :'<ref>',
11
+ :cite,
12
+ :__toc__,
13
+ :__forcetoc__,
14
+ :defaultsort,
15
+ :reflist
16
+ ].freeze
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,11 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ PRONOUNS = %i[
5
+ i me myself mine my we us ourselves ourself ours our you yourself yours
6
+ your thou thee thyself thine thy yourselves y'all youse you-uns y'all
7
+ youse yous yis yourselves y'all's selves yous's
8
+ ].freeze
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,10 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ SEX = %i[
5
+ vagina sex anal penis breast breasts buttocks sodomy sodomized erect
6
+ nipple nipples vibrator vibrator dildo dildos
7
+ ].freeze
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,96 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ module WordLists
4
+ # This list is taken from https://github.com/snipe/banbuilder and can be
5
+ # downloaded from:
6
+ # https://github.com/snipe/banbuilder/blob/master/word-dbs/wordlist.csv
7
+ VULGARISM = %i[
8
+ $#!+ $1ut $h1t $hit $lut 'ho 'hobag a$$ anus ass assmunch b1tch
9
+ ballsack bastard beaner beastiality biatch beeyotch bitchy
10
+ blow blowjob bollock bollocks bollok boner boob bugger buttplug
11
+ c-0-c-k c-o-c-k c-u-n-t c.0.c.k c.o.c.k. c.u.n. jerk jackoff
12
+ jackhole j3rk0ff homo hom0 hobag hell h0mo h0m0 goddamn goddammit
13
+ godamnit ghey ghay gfy gay fudgepacker fuckwad fucktard fuckoff
14
+ fucker fuck-tard fuck fellatio fellate felching felcher felch
15
+ fartknocker fart fannybandit fanny faggot fagg fag f.u.c.k f-u-c-k
16
+ dyke douchebag douche douch3 doosh dike dick damnit damn dammit d1ldo
17
+ d1ld0 d1ck d0uche d0uch3 cunt cumstain cum crap coon cock clitoris
18
+ clit cl1t cawk c0ck jerk0ff jerkoff jizz knobend labia lmfao moolie
19
+ muff nigga nigger p.u.s.s.y. piss piss-off pissoff prick pube pussy
20
+ queer retard retarded s-h-1-t s-h-i-t s.h.i.t. scrotum sh1t shit slut
21
+ smegma t1t tard terd tit tits titties turd twat vag wank wetback
22
+ whore whoreface 'f*ck' sh*t pu$$y p*ssy diligaf wtf stfu fu*ck fack
23
+ shite fxck sh!t @sshole assh0le assho!e a$$hole a$$h0le a$$h0!e
24
+ a$$h01e assho1e wh0re f@g f@gg0t f@ggot motherf*cker mofo cuntlicker
25
+ cuntface dickbag cockknocker beatch fucknut nucking futs mams cunny
26
+ quim clitty kike spic wop chink humper feltch feltcher fvck ahole
27
+ nads spick douchey bullturds gonads bitch butt fellatio lmao s-o-b
28
+ spunk he11 jizm jism bukkake shiz wigger gook ritard reetard
29
+ masterbate masturbate goatse masterbating masturbating hitler nazi
30
+ tubgirl gtfo foad r-tard rtard hoor g-spot gspot vulva assmaster
31
+ viagra phuck frack fuckwit assbang assbanged assbangs asshole
32
+ assholes asswipe asswipes b1tch bastards bitched bitches boners
33
+ bullshit bullshits bullshitted cameltoe chinc chincs chink chode
34
+ chodes clit clits cocks coons cumming cunts d1ck dickhead dickheads
35
+ doggie-style douchebags dumass dumbass dumbasses dykes faggit fags
36
+ fucked fucker fuckface fucks godamnit gooks humped humping jackass
37
+ jap japs jerk jizzed kikes knobend kooch kooches kootch fuckers
38
+ motherfucking niggah niggas niggers p.u.s.s.y. pussies queers rim s0b
39
+ shitface shithead shits shitted s.o.b. spik spiks twats whack whores
40
+ zoophile m-fucking mthrfucking muthrfucking mutherfucking
41
+ mutherfucker mtherfucker mthrfucker mthrf*cker whorehopper copulator
42
+ whoralicious whorealicious aeolus analprobe areola areole aryan arian
43
+ asses assfuck azazel baal babes bang banger barf bawdy beardedclam
44
+ beater beaver beer bigtits bimbo blew blow blowjobs blowup bod bodily
45
+ boink bone boned bong boobies boobs booby booger bookie booky bootee
46
+ bootie booty booze boozer boozy bosom bosomy bowel bowels bra
47
+ brassiere bung babe bush buttfuck cocaine kinky klan panties
48
+ pedophile pedophilia pedophiliac punkass queaf rape scantily essohbee
49
+ shithouse smut snatch toots doggie anorexia bulimia bulimiic burp
50
+ busty buttfucker caca cahone carnal carpetmuncher cervix climax
51
+ cocain cocksucker coital coke commie condom corpse coven crabs crack
52
+ crackwhore crappy cuervo cummin cumshot cumshots cunnilingus dago
53
+ dagos damned dick-ish dickish dickweed anorexic prostitute marijuana
54
+ lsd pcp diddle dawgie-style dimwit dingle doofus dopey douche drunk
55
+ dummy ejaculate enlargement erect erotic exotic extacy extasy faerie
56
+ faery fagged fagot fairy fisted fisting fisty floozy fondle foobar
57
+ foreskin frigg frigga fubar fucking fuckup ganja gays glans godamn
58
+ goddam goldenshower gonad gonads handjob hebe hemp heroin herpes
59
+ hijack hiv homey honky hooch hookah hooker hootch hooter hooters hump
60
+ hussy hymen inbred incest injun jerked jiz jizm horny junkie junky
61
+ kill kkk kraut kyke lech leper lesbians lesbos lez lezbian lezbians
62
+ lezbo lezbos lezzie lezzies lezzy loin loins lube lust lusty massa
63
+ masterbation masturbation maxi menses menstruate menstruation meth
64
+ molest moron motherfucka motherfucker murder muthafucker nad naked
65
+ napalm nappy nazism negro niggle nimrod ninny nooky nympho opiate
66
+ opium oral orally organ orgasm orgies orgy ovary ovum ovums paddy
67
+ pantie panty pastie pasty pecker pedo pee peepee penetrate
68
+ penetration penial penile perversion peyote phalli phallic
69
+ pillowbiter pimp pinko pissed pms polack porn porno pornography pot
70
+ potty prig prude pubic pubis punky puss queef queefing quife quicky
71
+ racist racy raped raper rapist raunch rectal rectum rectus reefer
72
+ reich revue risque rum rump sadism sadist satan scag schizo screw
73
+ screwed scrog scrot scrote scrud scum seaman seamen seduce semen
74
+ sex_story sexual shithole shitter shitty s*o*b sissy skag slave
75
+ sleaze sleazy sluts smutty sniper snuff sodom souse soused sperm
76
+ spooge stab steamy stiffy stoned strip stroke whacking suck sucked
77
+ sucking tampon tawdry teat teste testee testes testis thrust thug
78
+ tinkle titfuck titi titty whacked toke tramp trashy tush undies unwed
79
+ urinal urine uterus uzi valium virgin vixen vodka vomit voyeur vulgar
80
+ wad wazoo wedgie weed weenie weewee weiner weirdo wench whitey whiz
81
+ whored whorehouse whoring womb woody x-rated xxx b@lls yeasty yobbo
82
+ sumofabiatch doggy-style doggy wang dong d0ng w@ng wh0reface
83
+ wh0ref@ce wh0r3f@ce tittyfuck tittyfucker tittiefucker cockholster
84
+ cockblock gai gey faig faigt a55 a55hole gae corksucker rumprammer
85
+ slutdumper niggaz muthafuckaz gigolo pussypounder herp herpy
86
+ transsexual orgasmic cunilingus anilingus dickdipper dickwhipper
87
+ dicksipper dickripper dickflipper dickzipper homoey queero freex
88
+ cunthunter shamedame slutkiss shiteater fuckass fucka$$ clitorus
89
+ assfucker assfuckers dillweed cracker teabagging shitt azz fuk
90
+ fucknugget cuntlick g@y @ss beotch pussys 's***' paedophile
91
+ pedophiles pedophile sucks licker lickers bitchface idiot tosser
92
+ idiots tossers
93
+ ].freeze
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/word_lists/bad'
2
+ require 'wikipedia/vandalism_detection/word_lists/biased'
3
+ require 'wikipedia/vandalism_detection/word_lists/pronouns'
4
+ require 'wikipedia/vandalism_detection/word_lists/sex'
5
+ require 'wikipedia/vandalism_detection/word_lists/vulgarism'
6
+ require 'wikipedia/vandalism_detection/word_lists/markup'
7
+
8
module Wikipedia
  module VandalismDetection
    module WordLists
      # Returns a new array with every word of the BAD, BIASED, PRONOUNS, SEX
      # and VULGARISM lists, each word listed once.
      #
      # Uses the non-destructive Array#uniq on purpose: Array#uniq! returns
      # nil when there is nothing to remove, which would make this method
      # return nil whenever the lists do not overlap.
      #
      # @return [Array<Symbol>] the combined, de-duplicated word list
      def self.all
        [*BAD, *BIASED, *PRONOUNS, *SEX, *VULGARISM].uniq
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'wikipedia'
2
+ require 'wikipedia/vandalism_detection/version'
3
+ require 'wikipedia/vandalism_detection/configuration'
4
+ require 'wikipedia/vandalism_detection/exceptions'
5
+
6
+ require 'wikipedia/vandalism_detection/text'
7
+ require 'wikipedia/vandalism_detection/revision'
8
+ require 'wikipedia/vandalism_detection/edit'
9
+ require 'wikipedia/vandalism_detection/page'
10
+ require 'wikipedia/vandalism_detection/page_parser'
11
+ require 'wikipedia/vandalism_detection/revision_parser'
12
+
13
+ require 'wikipedia/vandalism_detection/word_lists'
14
+ require 'wikipedia/vandalism_detection/diff'
15
+ require 'wikipedia/vandalism_detection/wikitext_extractor'
16
+ require 'wikipedia/vandalism_detection/features'
17
+ require 'wikipedia/vandalism_detection/feature_calculator'
18
+
19
+ require 'wikipedia/vandalism_detection/instances'
20
+ require 'wikipedia/vandalism_detection/training_dataset'
21
+ require 'wikipedia/vandalism_detection/test_dataset'
22
+ require 'wikipedia/vandalism_detection/classifier'
23
+ require 'wikipedia/vandalism_detection/evaluator'
24
+
25
+ require 'weka/classifiers/meta/one_class_classifier'
26
+ require 'weka/classifiers/meta/real_ada_boost'
27
+ require 'weka/classifiers/trees/balanced_random_forest'
28
+
29
+ require 'weka/filters/supervised/instance/smote'
data/lib/wikipedia.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'timeout'
4
+
5
module Wikipedia
  # Returns the base URI of the English Wikipedia query API (XML format).
  # It ends with '&' so that a parameter string can be appended directly.
  def self.api_base_uri
    'https://en.wikipedia.org/w/api.php?format=xml&action=query&'
  end

  # Joins the given params into a "key=value&key=value" string.
  # Keys and values are inserted as they are; escaping of the complete URI
  # happens in api_request.
  def self.param_string(params)
    params.map { |k, v| "#{k}=#{v}" }.join('&')
  end

  # Reads the content behind the given uri and retries on failure.
  #
  # uri     - the URI string to read
  # times   - number of retries after the first failed attempt
  # timeout - seconds after which a single attempt is aborted
  #
  # Any StandardError (this includes Timeout::Error) triggers a retry.
  # When all attempts failed, a warning is printed and an empty string is
  # returned instead of raising.
  def self.request_with_retry(uri, times = 1, timeout = 5)
    content = ''

    begin
      Timeout.timeout(timeout) do
        content = URI.parse(uri).read
      end
    rescue => error
      if times > 0
        times -= 1
        retry
      else
        warn "Requesting '#{uri}' failed multiple times.\n#{error.message}"
      end
    end

    content
  end

  # Requests the Wikipedia API with the given params and returns the
  # response parsed by Nokogiri::XML.
  #
  # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape performs
  # the same escaping and is available on old and new Ruby versions.
  # The helper methods are called on Wikipedia explicitly, because they are
  # singleton methods of the module and would not be found on the private
  # instance-level copy that module_function keeps for including classes.
  def api_request(params = {})
    raw_uri = Wikipedia.api_base_uri + Wikipedia.param_string(params)
    uri = URI::DEFAULT_PARSER.escape(raw_uri)
    content = Wikipedia.request_with_retry(uri, 3)
    Nokogiri::XML(content)
  end

  module_function :api_request
end
@@ -0,0 +1,19 @@
1
FactoryBot.define do
  edit_class = Wikipedia::VandalismDetection::Edit

  # Edit built from the :old_revision and :new_revision factories.
  factory :edit, class: edit_class do
    old_revision { FactoryBot.build(:old_revision) }
    new_revision { FactoryBot.build(:new_revision) }
    page_id { nil }
    page_title { nil }

    # The revisions are passed positionally, the page data as keyword
    # arguments.
    initialize_with do
      new(old_revision, new_revision, page_id: page_id, page_title: page_title)
    end
  end

  # Same as :edit, but the new revision comes from the :anonymous_revision
  # factory.
  factory :anonymous_edit, class: edit_class do
    old_revision { FactoryBot.build(:old_revision) }
    new_revision { FactoryBot.build(:anonymous_revision) }
    page_id { nil }
    page_title { nil }

    initialize_with do
      new(old_revision, new_revision, page_id: page_id, page_title: page_title)
    end
  end
end
@@ -0,0 +1,12 @@
1
FactoryBot.define do
  # Page without id and title that gets three revisions added after build,
  # each built with the contributor 'User'.
  factory :page, class: Wikipedia::VandalismDetection::Page do
    id { nil }
    title { nil }

    after(:build) do |page|
      # Revisions are added in the order old -> new -> even newer.
      %i[old_revision new_revision even_newer_revision].each do |revision_factory|
        page.add_revision FactoryBot.build(revision_factory, contributor: 'User')
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
FactoryBot.define do
  revision_class = Wikipedia::VandalismDetection::Revision
  text_class = Wikipedia::VandalismDetection::Text

  # Revision without ids and timestamp, with empty text and comment.
  factory :empty_revision, class: revision_class do
    id { nil }
    parent_id { nil }
    timestamp { nil }
    text { text_class.new }
    comment { text_class.new }
  end

  # Revision '1' without parent and timestamp.
  factory :old_revision, class: revision_class do
    id { '1' }
    parent_id { nil }
    timestamp { nil }
    text { text_class.new('text 1') }
    comment { text_class.new }
  end

  # Revision '2' with revision '1' as parent.
  factory :new_revision, class: revision_class do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { text_class.new('text 2') }
    comment { text_class.new }
  end

  # Revision '3' with revision '2' as parent, one day after :new_revision.
  factory :even_newer_revision, class: revision_class do
    id { '3' }
    parent_id { '2' }
    timestamp { '2014-11-28T18:00:00Z' }
    text { text_class.new('text 3') }
    comment { text_class.new }
  end

  # Like :new_revision, but with an IP address as contributor.
  factory :anonymous_revision, class: revision_class do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { text_class.new('text 2') }
    comment { text_class.new }
    contributor { '127.0.0.1' }
  end

  # Like :new_revision, but with the contributor '12345'.
  factory :registered_revision, class: revision_class do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { text_class.new('text 2') }
    comment { text_class.new }
    contributor { '12345' }
  end
end
@@ -0,0 +1,35 @@
1
+ corpora:
2
+ base_directory: ../../../../spec/resources/corpora
3
+
4
+ training:
5
+ base_directory: training
6
+ annotations_file: annotations.csv
7
+ edits_file: edits.csv
8
+ revisions_directory: revisions
9
+
10
+ test:
11
+ base_directory: test
12
+ edits_file: edits.csv
13
+ revisions_directory: revisions
14
+ ground_truth_file: ground-truth.txt
15
+
16
+ output:
17
+ base_directory: ../../../../spec/resources/build
18
+ training:
19
+ index_file: training_index.yml
20
+ arff_file: training.arff
21
+ test:
22
+ index_file: test_index.yml
23
+ arff_file: test.arff
24
+ classification_file: classification.txt
25
+
26
+ features:
27
+ - anonymity
28
+ - character sequence
29
+ - comment length
30
+
31
+ classifier:
32
+ type: Trees::RandomForest
33
+ options: -I 10
34
+ cross-validation-fold: 2
35
+ training-data-options: unbalanced
@@ -0,0 +1,8 @@
1
+ "editid","editor","oldrevisionid","newrevisionid","diffurl","edittime","editcomment","articleid","articletitle"
2
+ 1641,"137.163.16.199",328774088,328774188,"http://en.wikipedia.org/w/index.php?diff=328774188&oldid=328774088","2009-11-30T10:23:13Z","/* Location */",100935,"Not annotated article"
3
+ 1642,"J04n",307084144,326873205,"http://en.wikipedia.org/w/index.php?diff=326873205&oldid=307084144","2009-11-20T04:42:24Z","Repairing links to disambiguation pages - [[Wikipedia:Disambiguation pages with links|You can help!]]",19490449,"The Soundstage Sessions"
4
+ 1643,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
5
+ 1644,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
6
+ 1647,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
7
+ 1648,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
8
+ 1649,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
@@ -0,0 +1,3 @@
1
+ 307084144 326873205 R
2
+ 326471754 326978767 dunno
3
+ 328774035 328774110 V