wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,94 @@
1
+ require 'active_support/core_ext/string'
2
+ require 'active_support/core_ext/array'
3
+
4
+ require 'wikipedia/vandalism_detection/wikitext_extractor'
5
+ require 'wikipedia/vandalism_detection/features'
6
+ require 'wikipedia/vandalism_detection/edit'
7
+
8
+ module Wikipedia
9
+ module VandalismDetection
10
+ # This class provides methods for calculating a feature set of an edit.
11
+ # The features that shall be used can be defined in the config/wikipedia-vandalism-detection.yml file
12
+ # under the 'features:' root attribute like this:
13
+ #
14
+ # features:
15
+ # - anonymity
16
+ # - character sequence
17
+ # - ...
18
+ # etc.
19
+ class FeatureCalculator
20
+ def initialize
21
+ @features = Wikipedia::VandalismDetection.config.features
22
+ raise FeaturesNotConfiguredError if @features.blank? || @features.empty?
23
+ @feature_classes = build_feature_classes @features
24
+ end
25
+
26
+ # Calculates the configured features for the given edit and returns an
27
+ # array of the computed values.
28
+ def calculate_features_for(edit)
29
+ raise ArgumentError, 'Input has to be an Edit.' unless edit.is_a?(Edit)
30
+
31
+ features = @feature_classes.map do |feature|
32
+ begin
33
+ feature.calculate(edit)
34
+ rescue WikitextExtractionError
35
+ $stderr.print %{
36
+ Edit (#{edit.old_revision.id}, #{edit.new_revision.id}) could not
37
+ be parsed by the WikitextExtractor and will be discarded.\n""}
38
+
39
+ Features::MISSING_VALUE
40
+ end
41
+ end
42
+
43
+ features
44
+ end
45
+
46
+ # Returns the calculated Numeric feature value for given edit and feature with given name
47
+ def calculate_feature_for(edit, feature_name)
48
+ unless edit.is_a?(Edit)
49
+ raise ArgumentError, 'First parameter has to be an Edit.'
50
+ end
51
+
52
+ unless feature_name.is_a?(String)
53
+ message = 'Second parameter has to be a feature name String ' \
54
+ '(e.g. "anonymity").'
55
+ raise ArgumentError, message
56
+ end
57
+
58
+ value = Features::MISSING_VALUE
59
+
60
+ begin
61
+ feature = feature_class_from_name(feature_name)
62
+ value = feature.calculate(edit)
63
+ rescue WikitextExtractionError
64
+ $stderr.print %{
65
+ Edit (#{edit.old_revision.id}, #{edit.new_revision.id}) could not
66
+ be parsed by the WikitextExtractor and will be discarded.\n""}
67
+ end
68
+
69
+ value
70
+ end
71
+
72
+ # Returns the feature names as defined in
73
+ # config/wikipedia-vandalism-detection.yml under 'features:'.
74
+ def used_features
75
+ @features
76
+ end
77
+
78
+ private
79
+
80
+ # Returns an array of all configured Feature class instances.
81
+ def build_feature_classes(feature_names)
82
+ feature_names.map do |name|
83
+ feature_class_from_name(name)
84
+ end
85
+ end
86
+
87
+ # Returns a new instance of the Feature class with the given name
88
+ def feature_class_from_name(name)
89
+ camelcased_name = name.split(/[\s-]/).map(&:capitalize!).join('')
90
+ "Wikipedia::VandalismDetection::Features::#{camelcased_name}".constantize.new
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,22 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes frequency of all wordlists words in the inserted
9
+ # text.
10
+ class AllWordlistsFrequency < FrequencyBase
11
+ # Returns the percentage of wordlists words in the inserted text.
12
+ # Returns 0.0 if inserted clean text is of zero length.
13
+ def calculate(edit)
14
+ super
15
+
16
+ text = Text.new(edit.inserted_words.join("\n")).clean
17
+ frequency(text, WordLists.all)
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/impact_base'
2
+ require 'wikipedia/vandalism_detection/word_lists'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes the percentage by which the edit increases the
8
+ # number of all wordlists words in the text.
9
+ class AllWordlistsImpact < ImpactBase
10
+ def calculate(edit)
11
+ super
12
+
13
+ old_text = edit.old_revision.text.clean
14
+ new_text = edit.new_revision.text.clean
15
+
16
+ impact(old_text, new_text, WordLists.all)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,17 @@
1
+ require_relative 'base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature describes whether the contributor of the new revision is
7
+ # an anonymous or registered Wikipedia user.
8
+ class Anonymity < Base
9
+ def calculate(edit)
10
+ super
11
+
12
+ edit.new_revision.anonymous_contributor? ? 0 : 1
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,32 @@
1
+ require_relative 'base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature describes whether the contributor of the old revision is
7
+ # an anonymous or registered Wikipedia user.
8
+ class AnonymityPrevious < Base
9
+ def calculate(edit)
10
+ super
11
+
12
+ old_revision = edit.old_revision
13
+
14
+ if old_revision.contributor.blank?
15
+ xml = Wikipedia.api_request(
16
+ prop: 'revisions',
17
+ rvprop: 'user',
18
+ revids: old_revision.id
19
+ )
20
+
21
+ contributor = xml.xpath('//rev/@user').text
22
+ return Features::MISSING_VALUE if contributor.blank?
23
+
24
+ old_revision.contributor = contributor
25
+ end
26
+
27
+ old_revision.anonymous_contributor? ? 0 : 1
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature computes the size of the edit's new revision text
7
+ # (article size).
8
+ class ArticleSize < Base
9
+ def calculate(edit)
10
+ super
11
+
12
+ edit.new_revision.text.size
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/bad'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes frequency of bad words in the inserted text.
9
+ class BadFrequency < FrequencyBase
10
+ # Returns the percentage of bad words in the inserted text.
11
+ # Returns 0.0 if inserted clean text is of zero length.
12
+ def calculate(edit)
13
+ super
14
+
15
+ text = Text.new(edit.inserted_words.join("\n")).clean
16
+ frequency(text, WordLists::BAD)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ require 'wikipedia/vandalism_detection/features/impact_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/bad'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes the percentage by which the edit increases the
8
+ # number of bad words in the text.
9
+ class BadImpact < ImpactBase
10
+ def calculate(edit)
11
+ super
12
+ old_text = edit.old_revision.text.clean
13
+ new_text = edit.new_revision.text.clean
14
+
15
+ impact(old_text, new_text, WordLists::BAD)
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,61 @@
1
+ require 'wikipedia'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ MISSING_VALUE = '?'.freeze
7
+
8
+ # This class should be the base class for all Wikipedia::VandalismDetection::Features classes.
9
+ class Base
10
+ # Base method for feature calculation.
11
+ # This method should be overridden in the concrete
12
+ # Wikipedia::VandalismDetection::Features classes.
13
+ #
14
+ # @example
15
+ # def calculate(edit)
16
+ # super # to handle ArgumentError
17
+ #
18
+ # ... concrete calculation of feature out of edit...
19
+ # end
20
+ def calculate(edit)
21
+ return if edit.is_a?(Edit)
22
+ raise ArgumentError, 'Passed argument has to be an Edit'
23
+ end
24
+
25
+ # Counts the appearances of a given single term or multiple terms in the
26
+ # given text.
27
+ #
28
+ # @param terms String or Array of terms
29
+ # @param options Hash of form { in: String }
30
+ #
31
+ # @example
32
+ # feature.count "and", in: text
33
+ # feature.count ["and", "or"], in: text
34
+ #
35
+ # @return Integer
36
+ def count(terms, options = {})
37
+ unless options[:in]
38
+ raise ArgumentError, 'The options hash must include the in: key'
39
+ end
40
+
41
+ unless terms.is_a?(String) || terms.is_a?(Array)
42
+ raise ArgumentError, 'The 1st arg should be an Array or String'
43
+ end
44
+
45
+ words = options[:in].downcase
46
+ freq = Hash.new(0)
47
+
48
+ words.gsub(/[\.,'{2,}:\!\?\(\)]/, '').split.each do |word|
49
+ freq[word.to_sym] += 1
50
+ end
51
+
52
+ if terms.is_a?(String)
53
+ freq[terms.downcase.to_sym]
54
+ else
55
+ terms.reduce(0) { |result, term| result + freq[term] }
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/biased'
3
+ require 'wikipedia/vandalism_detection/text'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ module Features
8
+ # This feature computes frequency of biased words in the inserted text.
9
+ class BiasedFrequency < FrequencyBase
10
+ # Returns the percentage of biased words in the inserted text.
11
+ # Returns 0.0 if inserted clean text is of zero length.
12
+ def calculate(edit)
13
+ super
14
+
15
+ text = Text.new(edit.inserted_words.join("\n")).clean
16
+ frequency(text, WordLists::BIASED)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/impact_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/biased'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes the percentage by which the edit increases the
8
+ # number of biased words in the text.
9
+ class BiasedImpact < ImpactBase
10
+ def calculate(edit)
11
+ super
12
+
13
+ old_text = edit.old_revision.text.clean
14
+ new_text = edit.new_revision.text.clean
15
+
16
+ impact(old_text, new_text, WordLists::BIASED)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,26 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature returns whether the edit is a blanking, i.e. text was removed
7
+ # and the new text size is < 7 (based on the Mola Velasco 2010 implementation).
8
+ class Blanking < Base
9
+ BLANKING_THRESHOLD = 7
10
+
11
+ def calculate(edit)
12
+ super
13
+
14
+ old_text_size = edit.old_revision.text.size
15
+ new_text_size = edit.new_revision.text.size
16
+
17
+ text_removed = old_text_size > new_text_size
18
+ above_threshold = new_text_size < BLANKING_THRESHOLD
19
+
20
+ blanking = text_removed && above_threshold
21
+ blanking ? 1.0 : 0.0
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature computes the character diversity of the edit's new revision inserted text.
7
+ # I.e. how many unique characters are amongst all inserted?
8
+ #
9
+ # Random typing leads to fewer unique characters relative to the full length.
10
+ class CharacterDiversity < Base
11
+ def calculate(edit)
12
+ super
13
+
14
+ inserted_letters = edit.inserted_text.scan(/[^\s]/)
15
+ all_letters_count = inserted_letters.size
16
+ unique_count = inserted_letters.uniq.size
17
+
18
+ all_letters_count**(1.0 / unique_count)
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,19 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature computes the length of the longest sequence of the same
7
+ # character in the inserted text.
8
+ class CharacterSequence < Base
9
+ def calculate(edit)
10
+ super
11
+
12
+ sequence_hash = edit.inserted_text.scan(/((.)\2*)/)
13
+ sequence_hash = sequence_hash.group_by { |seq, _| seq.length }
14
+ sequence_hash.empty? ? 0 : sequence_hash.max.first
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/bad'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes frequency of bad words in the comment of the
8
+ # edit's new revision.
9
+ class CommentBadFrequency < FrequencyBase
10
+ # Returns the percentage of bad words in the new revision's comment.
11
+ # Returns 0.0 if text is of zero length.
12
+ def calculate(edit)
13
+ super
14
+
15
+ comment = edit.new_revision.comment.clean
16
+ frequency(comment, WordLists::BAD)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/biased'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ module Features
7
+ # This feature computes frequency of biased words in the comment of the
8
+ # edit's new revision.
9
+ class CommentBiasedFrequency < FrequencyBase
10
+ # Returns the percentage of biased words in the new revision's comment.
11
+ # Returns 0.0 if text is of zero length.
12
+ def calculate(edit)
13
+ super
14
+
15
+ comment = edit.new_revision.comment.clean
16
+ frequency(comment, WordLists::BIASED)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,16 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ module Features
6
+ # This feature computes the byte length of the edit's new revision's
7
+ # comment.
8
+ class CommentLength < Base
9
+ def calculate(edit)
10
+ super
11
+ edit.new_revision.comment.clean.bytesize
12
+ end
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,30 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'wikipedia/vandalism_detection/word_lists/markup'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: how often markup terms occur in the comment of the edit's
      # new revision, relative to the comment's word count.
      class CommentMarkupFrequency < Base
        # Alternation of every entry of WordLists::MARKUP.
        MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/

        # Returns the number of markup matches divided by the number of
        # whitespace-separated words in the new revision's (uncleaned)
        # comment. Returns 0.0 when the comment contains no words.
        def calculate(edit)
          super

          text = edit.new_revision.comment
          word_total = text.split.count
          return 0.0 if word_total.zero?

          markup_hits = text.scan(MARKUP_REGEX).count
          markup_hits.to_f / word_total
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/pronouns'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of words from the pronoun word list within the
      # comment of the edit's new revision.
      class CommentPronounFrequency < FrequencyBase
        # Passes the cleaned comment of the new revision together with
        # WordLists::PRONOUNS to the inherited #frequency helper and
        # returns its result.
        def calculate(edit)
          super

          revision = edit.new_revision
          frequency(revision.comment.clean, WordLists::PRONOUNS)
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/sex'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of words from the sex word list within the
      # comment of the edit's new revision.
      class CommentSexFrequency < FrequencyBase
        # Passes the cleaned comment of the new revision together with
        # WordLists::SEX to the inherited #frequency helper and returns
        # its result.
        def calculate(edit)
          super

          revision = edit.new_revision
          frequency(revision.comment.clean, WordLists::SEX)
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wikipedia/vandalism_detection/features/frequency_base'
2
+ require 'wikipedia/vandalism_detection/word_lists/vulgarism'
3
+
4
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of words from the vulgarism word list within
      # the comment of the edit's new revision.
      class CommentVulgarismFrequency < FrequencyBase
        # Passes the cleaned comment of the new revision together with
        # WordLists::VULGARISM to the inherited #frequency helper and
        # returns its result.
        def calculate(edit)
          super

          revision = edit.new_revision
          frequency(revision.comment.clean, WordLists::VULGARISM)
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+ require 'zlib'
3
+ require 'wikipedia/vandalism_detection/diff'
4
+
5
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: relates the raw byte size of the edit's inserted text to
      # the byte size of its deflate-compressed form.
      class Compressibility < Base
        # Returns raw_size / (compressed_size + raw_size) for the inserted
        # text, or 0.5 when the inserted text is empty.
        #
        # A result above 0.5 means the compressed form is smaller than the
        # raw text. Highly repetitive input therefore scores high, which
        # can indicate nonsense text such as
        # 'AAAAAAAAAAAAAAAAAAAhhhhhhhhhhhhhhhh!'.
        def calculate(edit)
          super

          text = edit.inserted_text
          return 0.5 if text.empty?

          raw_bytes = text.bytesize.to_f
          deflated_bytes = Zlib::Deflate.deflate(text).bytesize.to_f

          raw_bytes / (deflated_bytes + raw_bytes)
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'wikipedia/vandalism_detection/features/base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Base class for features that test a comment for given terms.
      class ContainsBase < Base
        # Checks case-insensitively whether the comment matches the given
        # term(s). An Array of terms is joined into one alternation; any
        # other value is interpolated into the pattern as it is, so terms
        # are treated as regular expression source, not as literal text.
        #
        # Returns 1 when the comment matches, otherwise 0.
        def contains(comment, terms)
          pattern_source = case terms
                           when Array then terms.join('|')
                           else terms
                           end

          if comment =~ /#{pattern_source}/i
            1
          else
            0
          end
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'wikipedia/vandalism_detection/features/contains_base'
2
+
3
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: flags whether the comment of the edit's new revision
      # mentions one of the revert key words.
      class Reverted < ContainsBase
        # Hands the new revision's comment and the key words 'rvt', 'rvv'
        # and 'revert' to the inherited #contains helper and returns its
        # result.
        def calculate(edit)
          super

          revert_terms = %w[rvt rvv revert]
          comment = edit.new_revision.comment
          contains(comment, revert_terms)
        end
      end
    end
  end
end