wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,12 @@
1
+ require 'wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence'
2
+
3
module Wikipedia
  module VandalismDetection
    # Mixin exposing the text algorithms of the gem as plain helper methods.
    module Algorithms
      # Delegates to a fresh KullbackLeiblerDivergence instance.
      #
      # @param text_a first argument handed to KullbackLeiblerDivergence#of
      # @param text_b second argument handed to KullbackLeiblerDivergence#of
      # @return whatever KullbackLeiblerDivergence#of returns for the pair
      def kullback_leibler_divergence(text_a, text_b)
        KullbackLeiblerDivergence.new.of(text_a, text_b)
      end
    end
  end
end
@@ -0,0 +1,202 @@
1
+ require 'weka'
2
+ require 'active_support/core_ext/string'
3
+ require 'fileutils'
4
+
5
+ require 'wikipedia/vandalism_detection/configuration'
6
+ require 'wikipedia/vandalism_detection/edit'
7
+ require 'wikipedia/vandalism_detection/feature_calculator'
8
+ require 'wikipedia/vandalism_detection/instances'
9
+ require 'wikipedia/vandalism_detection/evaluator'
10
+
11
module Wikipedia
  module VandalismDetection
    # Trains the Weka classifier named in the configuration and scores edits
    # (or raw feature vectors) with it.
    class Classifier
      attr_reader :evaluator, :dataset

      # Loads and trains the classifier configured in the config file.
      #
      # @param dataset training instances to use; when nil the dataset is
      #   picked according to the configured training data options
      # @raise [ClassifierNotConfiguredError] if no classifier type is set
      # @raise [FeaturesNotConfiguredError] if no features are configured
      # @raise [ClassifierUnknownError] if the classifier type does not
      #   resolve to a constant below Weka::Classifiers
      def initialize(dataset = nil)
        @config = Wikipedia::VandalismDetection.config
        @feature_calculator = FeatureCalculator.new
        @classifier = load_classifier(dataset)
        @evaluator = Evaluator.new(self)
      end

      # Returns the concrete classifier instance configured in the config file.
      # When you configured a Trees::RandomForest classifier you will get a
      # Weka::Classifiers::Trees::RandomForest instance.
      # This instance can be used for native function calls of the classifier
      # class.
      def classifier_instance
        @classifier
      end

      # Classifies an edit or a set of features and returns the vandalism
      # confidence by default.
      # If option 'return_all_params: true' is set, it returns a Hash of form
      # { confidence: ..., class_index: ... }
      #
      # NOTE: when the feature list is empty the plain value -1.0 is returned,
      # even if 'return_all_params: true' was requested.
      #
      # @example
      #   # suppose you have a dataset with 2 features, or 'edit' as an
      #   # instance of Wikipedia::VandalismDetection::Edit
      #   classifier = Wikipedia::VandalismDetection::Classifier.new
      #   features = [0.45, 0.67]
      #
      #   confidence = classifier.classify(features)
      #   confidence = classifier.classify(edit)
      #
      # @param edit_or_features [Edit, Array] an Edit, or an Array holding
      #   exactly one value per configured feature
      # @param options [Hash] only :return_all_params is read
      # @raise [ArgumentError] if the input is neither of the above
      def classify(edit_or_features, options = {})
        given_edit = edit_or_features.is_a?(Edit)
        given_features = edit_or_features.is_a?(Array) &&
                         edit_or_features.size == @config.features.count

        unless given_edit || given_features
          raise ArgumentError, 'Input has to be an Edit or an Array of feature values.'
        end

        feature_values =
          if given_edit
            @feature_calculator.calculate_features_for(edit_or_features)
          else
            edit_or_features
          end
        return -1.0 if feature_values.empty?

        # Missing feature values are handed to the dataset as nil.
        feature_values = feature_values.map do |value|
          value == Features::MISSING_VALUE ? nil : value
        end

        instance = unlabeled_instance_for(feature_values)
        index = confidence_index
        confidence = @classifier.distribution_for_instance(instance).to_a[index]

        return confidence unless options[:return_all_params]

        class_index = @classifier.classify_instance(instance)
        class_index = class_index.nan? ? Instances::NOT_KNOWN_INDEX : class_index.to_i
        { confidence: confidence, class_index: class_index }
      end

      # Cross validates the classifier.
      # Fold is used as defined in configuration (default is 10).
      #
      # @example
      #   classifier = Wikipedia::VandalismDetection::Classifier.new
      #   evaluation = classifier.cross_validate
      #   evaluation = classifier.cross_validate(equally_distributed: true)
      #
      def cross_validate(options = {})
        @evaluator.cross_validate(options)
      end

      private

      # Loads the (Weka-) Classifier set in the Configuration, trains it with
      # the given (or the configured default) dataset and remembers that
      # dataset in @dataset.
      def load_classifier(dataset)
        classifier_name = @config.classifier_type

        unless classifier_name
          message = 'Classifier type is not defined in wikipedia-vandalism-detection.yml'
          raise ClassifierNotConfiguredError, message
        end

        if @config.features.blank?
          message = 'No features configured in wikipedia-vandalism-detection.yml'
          raise FeaturesNotConfiguredError, message
        end

        classifier_class = resolve_classifier_class(classifier_name)
        options = @config.classifier_options

        puts "Loading classifier #{classifier_name} with options '#{options}'…"

        dataset = default_training_dataset if dataset.nil?

        if @config.use_occ?
          # Relabels one class value of the class attribute as the outlier
          # value; this changes the dataset object that was passed in.
          dataset.rename_attribute_value(
            dataset.class_index,
            one_class_index,
            Instances::OUTLIER
          )
        end

        @dataset = dataset

        begin
          # 'options' and 'dataset' are read inside the block as closure
          # locals, so they must stay local variables of this method.
          classifier_class.build do
            use_options options if options
            train_with_instances dataset
          end
        rescue StandardError => error
          raise "Error while loading classifier: #{error}"
        end
      end

      # Resolves the constant Weka::Classifiers::<classifier_name> exactly
      # once and maps a failed lookup to ClassifierUnknownError.
      def resolve_classifier_class(classifier_name)
        "Weka::Classifiers::#{classifier_name}".constantize
      rescue StandardError
        message = "The configured classifier type '#{classifier_name}' is unknown."
        raise ClassifierUnknownError, message
      end

      # Picks the training instances matching the configured training data
      # options; returns nil if none of the three predicates holds.
      def default_training_dataset
        if @config.balanced_training_data?
          puts 'using BALANCED training dataset'
          TrainingDataset.balanced_instances
        elsif @config.unbalanced_training_data?
          puts 'using FULL (unbalanced) training dataset'
          TrainingDataset.instances
        elsif @config.oversampled_training_data?
          puts 'using OVERSAMPLED training dataset'
          TrainingDataset.oversampled_instances
        end
      end

      # Wraps the feature values into a one-row dataset and returns that row
      # with its class value marked as missing.
      def unlabeled_instance_for(feature_values)
        single_row = Instances.empty
        single_row.set_class_index(feature_values.count)
        single_row.add_instance([*feature_values, Instances::VANDALISM])

        instance = single_row.instance(0)
        instance.set_class_missing
        instance
      end

      # Position inside the classifier's distribution that is reported as
      # confidence: the vandalism class, except for one-class classification
      # whose options do not mention Instances::VANDALISM.
      def confidence_index
        return Instances::VANDALISM_CLASS_INDEX unless @config.use_occ?

        if occ_targets_vandalism?
          Instances::VANDALISM_CLASS_INDEX
        else
          Instances::REGULAR_CLASS_INDEX
        end
      end

      # Truthy when the configured classifier options contain the
      # Instances::VANDALISM label.
      def occ_targets_vandalism?
        @config.classifier_options =~ /#{Instances::VANDALISM}/
      end

      # Index of the class value that gets renamed to Instances::OUTLIER for
      # one-class classification (the counterpart of the class named in the
      # classifier options).
      def one_class_index
        if occ_targets_vandalism?
          Instances::REGULAR_CLASS_INDEX
        else
          Instances::VANDALISM_CLASS_INDEX
        end
      end

      # Returns a new, normalized dataset that holds only the vandalism
      # instances of the given dataset.
      def remove_regular_instances(dataset)
        features = @config.features

        vandalism_dataset = Weka::Core::Instances.new.with_attributes do
          features.each { |name| numeric :"#{name.tr(' ', '_')}" }
          nominal :class, values: [Instances::VANDALISM], class_attribute: true
        end

        dataset.to_a.map(&:values).each_with_index do |attributes, index|
          class_value = Instances::CLASSES[dataset.instance(index).value(dataset.class_index).to_i]
          next unless class_value == Instances::VANDALISM

          # The last attribute is the class value; it is re-appended below.
          vandalism_dataset.add_instance([*attributes[0..-2], class_value])
        end

        filter = Weka::Filters::Unsupervised::Attribute::Normalize.new
        filter.filter(vandalism_dataset)
      end
    end
  end
end
@@ -0,0 +1,350 @@
1
+ require 'weka/classifiers/meta/one_class_classifier'
2
+ require 'singleton'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ require 'yaml'
7
+
8
# Shortcut to the one shared Configuration object (Configuration.instance).
def self.config
  Configuration.instance
end
11
+
12
# Singleton holding the settings of the gem: the defaults from
# DefaultConfiguration merged with the values of the config file (if any).
class Configuration
  include Singleton

  TRAINING_DATA_BALANCED = 'balanced'.freeze
  TRAINING_DATA_UNBALANCED = 'unbalanced'.freeze
  TRAINING_DATA_OVERSAMPLED = 'oversampled'.freeze
  CONFIG_FILE = 'wikipedia-vandalism-detection.yml'.freeze

  # Whole-value match for an affirmative setting (case-insensitive,
  # surrounding whitespace ignored).
  AFFIRMATIVE_VALUE = /\A\s*(true|t|yes|y)\s*\z/i

  # Method names that are also legal instance variable names; anything else
  # (e.g. names ending in '?' or '=') can never be a dynamic reader.
  DYNAMIC_READER_NAME = /\A[a-zA-Z_]\w*\z/

  attr_reader :data,
              :features,
              :classifier_options,
              :classifier_type,
              :cross_validation_fold,
              :output_base_directory,
              :training_data_options

  # Builds the configuration from DefaultConfiguration::DEFAULTS, deep-merged
  # with the config file content when load_config_file returns one, and
  # caches the most frequently used values in instance variables.
  def initialize
    config = DefaultConfiguration[DefaultConfiguration::DEFAULTS]
    @config_from_file ||= config.load_config_file(config.config_file)

    @data ||= @config_from_file ? config.deep_merge(@config_from_file) : config

    classifier_settings = @data['classifier']
    @classifier_type = classifier_settings['type']
    @classifier_options = classifier_settings['options']
    @cross_validation_fold = classifier_settings['cross-validation-fold']
    @training_data_options = classifier_settings['training-data-options']
    @replace_missing_values = classifier_settings['replace-missing-values'].to_s

    @features = @data['features']

    output_settings = @data['output']
    @output_base_directory = File.expand_path(output_settings['base_directory'], __FILE__)
    @training_arff_file_name = output_settings['training']['arff_file']
    @test_arff_file_name = output_settings['test']['arff_file']
  end

  # Returns whether the classifier uses one class classification
  def use_occ?
    @classifier_type == Weka::Classifiers::Meta::OneClassClassifier.type
  end

  # Returns true if the 'replace-missing-values' setting is one of
  # true, t, yes or y (case-insensitive). The whole value has to match, so
  # unrelated words that merely contain a 't' or 'y' do not enable it.
  def replace_training_data_missing_values?
    !(@replace_missing_values =~ AFFIRMATIVE_VALUE).nil?
  end

  # Returns a boolean value whether a balanced data set is used for
  # classifier training.
  # (balanced means: same number of vandalism and regular samples)
  def balanced_training_data?
    @training_data_options == TRAINING_DATA_BALANCED
  end

  # Returns a boolean value whether an unbalanced data set is used for
  # classifier training. This is also the fallback whenever the option is
  # missing or neither balanced nor oversampled.
  # (unbalanced means: vandalism and regular samples are used as given in
  # arff file)
  def unbalanced_training_data?
    return true if @training_data_options.nil?
    return true if @training_data_options == TRAINING_DATA_UNBALANCED

    !balanced_training_data? && !oversampled_training_data?
  end

  # Returns a boolean value whether an oversampled data set is used for
  # classifier training.
  # (oversampled means: a balanced dataset is enriched through vandalism
  # instances if vandalism number is less than regular number)
  def oversampled_training_data?
    return false if @training_data_options.nil?

    @training_data_options.include?(TRAINING_DATA_OVERSAMPLED)
  end

  # Returns a hash of the oversampled training data options
  # ({ percentage: Float, undersampling: Float }), or an empty hash when the
  # training data is not oversampled.
  # Allowed options are -p (-percentage) and -u (-undersampling).
  #
  # percentage:    number following the option, 100.0 without the option.
  # undersampling: 100.0 without the option; 0.0 when the option contains
  #                none of true/t/yes/y; otherwise the last whitespace
  #                separated token of the option converted with to_f.
  def oversampling_options
    return {} unless oversampled_training_data?

    segments = @training_data_options.gsub(TRAINING_DATA_OVERSAMPLED, '').split('-')

    percent_option = segments.find { |segment| segment.match(/(p\s|percentage\s)\d+/i) }
    undersampling_option = segments.find { |segment| segment.match(/(u\s|undersampling\s)/i) }

    percent = percent_option.nil? ? 100.0 : percent_option.split.last.to_f

    undersampling =
      if undersampling_option.nil?
        100.0
      elsif undersampling_option.match(/(true|t|yes|y)/i)
        last_token = undersampling_option.split.last
        last_token.nil? ? 100.0 : last_token.to_f
      else
        0.0
      end

    { percentage: percent, undersampling: undersampling }
  end

  # Returns the path to the classification file.
  # Automatically sub directories for classifier and training data options
  # are added. Thus it results in
  # <output base dir>/<classifier name>/<training data options>/<file name>
  #
  # Missing training data options are treated as 'unbalanced', in line with
  # unbalanced_training_data?.
  def test_output_classification_file
    classification_file_name = @data['output']['test']['classification_file']
    classifier_name = @classifier_type.split('::').last.downcase
    options_directory = (@training_data_options || TRAINING_DATA_UNBALANCED).gsub(/\s+/, '_')

    File.join(
      @output_base_directory,
      classifier_name,
      options_directory,
      classification_file_name
    )
  end

  # Returns the training arff file name.
  # The path is expanded by used classifier & options and is in the same
  # directory as the classification file.
  def training_output_arff_file
    File.join(File.dirname(test_output_classification_file), @training_arff_file_name)
  end

  # Returns the test arff file name.
  # The path is expanded by used classifier & options and is in the same
  # directory as the classification file.
  def test_output_arff_file
    File.join(File.dirname(test_output_classification_file), @test_arff_file_name)
  end

  # Returns file/path string for corpora files/directories and output files
  # after following schema: <corpus type>_<progress stage>_<file name>.
  #
  # Instead of 'corpora' the word 'corpus' is used for grammatical reasons.
  #
  # example:
  #   training_corpus_edits_file()
  #   test_output_index_file()
  #
  # A method named like an existing instance variable returns that variable;
  # computed paths are stored in such a variable, so they are built once.
  # Every other name ends in the regular NoMethodError.
  def method_missing(method_name, *args)
    name = method_name.to_s
    # Names like 'foo?' are no legal instance variable names and would make
    # instance_variable_defined? raise a NameError of its own.
    return super unless name =~ DYNAMIC_READER_NAME
    return instance_variable_get("@#{name}") if instance_variable_defined?("@#{name}")

    corpus_type, progress_stage, *file_parts = name.split('_')
    return super if file_parts.size < 2
    return super unless %w[corpus output].include?(progress_stage)

    if progress_stage == 'corpus'
      progress_stage = 'corpora'
      path = File.join(
        @data[progress_stage]['base_directory'],
        @data[progress_stage][corpus_type]['base_directory']
      )
    else
      path = @output_base_directory
    end

    relative_path = File.join(path, @data[progress_stage][corpus_type][file_parts.join('_')])
    absolute_path = File.expand_path(relative_path, __FILE__)
    instance_variable_set("@#{name}", absolute_path)
  end

  # Keeps respond_to? in sync with method_missing: true for names mirrored
  # by an instance variable and for path methods whose entry exists in the
  # loaded configuration data.
  def respond_to_missing?(method_name, include_private = false)
    name = method_name.to_s
    return super unless name =~ DYNAMIC_READER_NAME
    return true if instance_variable_defined?("@#{name}")

    path_entry_configured?(name) || super
  end

  private

  # True if name follows <corpus type>_<corpus|output>_<file name> and the
  # configuration data holds a non-nil value for that file name.
  def path_entry_configured?(name)
    return false unless @data.is_a?(Hash)

    corpus_type, progress_stage, *file_parts = name.split('_')
    return false if file_parts.size < 2

    section =
      case progress_stage
      when 'corpus' then @data['corpora']
      when 'output' then @data['output']
      end
    settings = section.is_a?(Hash) ? section[corpus_type] : nil

    settings.is_a?(Hash) && !settings[file_parts.join('_')].nil?
  end
end
184
+
185
# This class represents the default config which is merged with the
# customized config from the config YAML file.
class DefaultConfiguration < Hash
  # Default settings. Only the outer hash is frozen; nested hashes and
  # arrays stay mutable.
  DEFAULTS = {
    'source' => Dir.pwd,
    'features' => [
      'anonymity',
      'anonymity previous',
      'all wordlists frequency',
      'all wordlists impact',
      'article size',
      'bad frequency',
      'bad impact',
      'biased frequency',
      'biased impact',
      'blanking',
      'character sequence',
      'character diversity',
      'comment length',
      'comment biased frequency',
      'comment pronoun frequency',
      'comment vulgarism frequency',
      'compressibility',
      'copyedit',
      'digit ratio',
      'edits per user',
      'emoticons frequency',
      'emoticons impact',
      'inserted size',
      'inserted words',
      'inserted character distribution',
      'inserted external links',
      'inserted internal links',
      'longest word',
      'markup frequency',
      'markup impact',
      'non-alphanumeric ratio',
      'personal life',
      'pronoun frequency',
      'pronoun impact',
      'removed size',
      'removed words',
      'removed all wordlists frequency',
      'removed bad frequency',
      'removed biased frequency',
      'removed character distribution',
      'removed emoticons frequency',
      'removed markup frequency',
      'removed pronoun frequency',
      'removed sex frequency',
      'removed vulgarism frequency',
      'replacement similarity',
      'reverted',
      'revisions character distribution',
      'sex frequency',
      'sex impact',
      'same editor',
      'size increment',
      'size ratio',
      'term frequency',
      'time interval',
      'time of day',
      'upper case ratio',
      'upper case words ratio',
      'upper to lower case ratio',
      'vulgarism frequency',
      'vulgarism impact',
      'weekday',
      'words increment'
    ],
    'corpora' => {
      'base_directory' => nil,
      'training' => {
        'base_directory' => nil,
        'edits_file' => nil,
        'annotations_file' => nil,
        'revisions_directory' => nil
      },
      'test' => {
        'base_directory' => nil,
        'edits_file' => nil,
        'revisions_directory' => nil,
        'ground_truth_file' => nil
      }
    },
    'output' => {
      'base_directory' => File.join(Dir.pwd, 'build'),
      'training' => {
        'arff_file' => 'training.arff',
        'index_file' => 'training_index.yml'
      },
      'test' => {
        'arff_file' => 'test.arff',
        'index_file' => 'test_index.yml',
        'classification_file' => 'classification.txt'
      }
    },
    'classifier' => {
      'type' => nil,
      'options' => nil,
      'cross-validation-fold' => 10,
      'training-data-options' => 'unbalanced',
      'replace-missing-values' => nil
    }
  }.freeze

  # Returns the default source directory, i.e. the working directory at the
  # time DEFAULTS was evaluated.
  def source
    DEFAULTS['source']
  end

  # Looks in three places for a custom config file and returns the first hit:
  #   1. <source>/config/
  #   2. <source>/lib/config/
  #   3. config/ below the directory of this source file or any of its
  #      parent directories
  #
  # If none of them exists, the topmost parent directory is returned (see
  # find_first_parent_path_for), which load_config_file reports as missing.
  def config_file
    config_file_path = "config/#{Configuration::CONFIG_FILE}"

    candidates = [
      File.join(source, config_file_path),
      File.join(source, "lib/#{config_file_path}")
    ]

    # The parent directory walk only runs when neither candidate exists.
    candidates.find { |candidate| File.exist?(candidate) } ||
      find_first_parent_path_for(
        File.expand_path(File.dirname(__FILE__)),
        config_file_path
      )
  end

  # Loads the given YAML config file.
  #
  # file - path of the config file; it has to be a regular file whose path
  #        contains the config file name.
  #
  # Returns the parsed YAML content. If the file is not usable, a warning is
  # printed and nil is returned.
  def load_config_file(file)
    config_file = Configuration::CONFIG_FILE

    # The file name is compared literally: interpolating it into a regexp
    # would turn its '.' into a wildcard and accept look-alike paths.
    # File.file? additionally rejects directories, which cannot be parsed.
    if File.file?(file) && file.to_s.include?(config_file)
      YAML.load_file(file)
    else
      warn %(

        Configuration file not found in
        #{source}/config,
        #{source}/lib/config directory
        or any other parent path.

        To customize the system, create a '#{config_file}' file.

      )
    end
  end

  private

  # Walks from start_path upwards and returns the first existing path of
  # file relative to one of the visited directories.
  #
  # Returns the last visited directory (the one that is its own parent) if
  # the file is not found anywhere on the way up.
  def find_first_parent_path_for(start_path, file)
    directory = start_path

    loop do
      candidate = File.join(directory, file)
      return candidate if File.exist?(candidate)

      parent = File.dirname(directory)
      return directory if parent == directory

      directory = parent
    end
  end
end
349
+ end
350
+ end
@@ -0,0 +1,36 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ require 'java'
4
+ require 'java/diffutils-1.3.0.jar'
5
+
6
+ java_import 'difflib.DiffUtils'
7
+
8
# Word based diff of two texts, computed by DiffUtils on the whitespace
# separated words of both texts.
class Diff
  # original - the text before the change
  # current  - the text after the change
  def initialize(original, current)
    @original, @current = [original, current].map { |text| clean_text(text) }
    @patch = DiffUtils.diff(@original.split, @current.split)
  end

  # Returns the words of the revised side of all deltas.
  def inserted_words
    collect_words { |delta| delta.revised }
  end

  # Returns the words of the original side of all deltas.
  def removed_words
    collect_words { |delta| delta.original }
  end

  private

  # Gathers the lines of the chunk the block picks from each delta into one
  # flat list.
  def collect_words
    @patch.deltas.map { |delta| yield(delta).lines }.flatten
  end

  # Transcodes the text from binary to UTF-8 and drops every byte that is
  # invalid or has no UTF-8 counterpart.
  def clean_text(text)
    replacement_options = { invalid: :replace, undef: :replace, replace: '' }
    text.encode('UTF-8', 'binary', **replacement_options)
  end
end
35
+ end
36
+ end
@@ -0,0 +1,81 @@
1
+ require 'wikipedia/vandalism_detection/diff'
2
+ require 'wikipedia/vandalism_detection/text'
3
+ require 'wikipedia/vandalism_detection/page'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
# An edit consists of two sequent revisions: the old revision has to be the
# parent of the new revision.
class Edit
  attr_reader :old_revision, :new_revision
  attr_accessor :page

  # old_revision - the revision before the edit
  # new_revision - the revision after the edit
  # attributes   - optional hash; :page sets the page of the edit, without
  #                it a new Page is used.
  #
  # Raises ArgumentError if the revisions are not sequent.
  def initialize(old_revision, new_revision, attributes = {})
    unless sequent?(old_revision, new_revision)
      # Built as a single line: a string literal spanning two source lines
      # would put a line break and the source indentation into the message.
      message = "old revision: #{old_revision.id} | " \
                "parent: #{old_revision.parent_id}, " \
                "new revision: #{new_revision.id} | " \
                "parent: #{new_revision.parent_id}"

      raise ArgumentError, "Revisions are not sequent: #{message}."
    end

    @old_revision = old_revision
    @new_revision = new_revision
    @page = attributes[:page] || Page.new
  end

  # Returns the values of the given attributes as a string: the old
  # revision's values joined by ',', a tab, then the new revision's values
  # joined by ','. Attributes a revision does not respond to are skipped.
  def serialize(*attributes)
    revision_strings = [@old_revision, @new_revision].map do |revision|
      supported = attributes.select { |attr| revision.respond_to?(attr) }
      supported.map { |attr| revision.public_send(attr) }.join(',')
    end

    revision_strings.join("\t")
  end

  # Returns an array of the words inserted in the new revision compared with
  # the old one.
  def inserted_words
    @inserted_words ||= diff.inserted_words
  end

  # Returns a Text of the words inserted in the new revision compared with
  # the old one.
  def inserted_text
    @inserted_text ||= Text.new(inserted_words.join(' '))
  end

  # Returns an array of the words removed in the new revision compared with
  # the old one.
  def removed_words
    @removed_words ||= diff.removed_words
  end

  # Returns a Text of the words removed in the new revision compared with
  # the old one.
  def removed_text
    @removed_text ||= Text.new(removed_words.join(' '))
  end

  protected

  # Returns whether the given revisions are sequent, i.e. the old revision's
  # id is the new revision's parent id.
  def sequent?(old_revision, new_revision)
    new_revision.parent_id == old_revision.id
  end

  private

  # Returns the Diff of both revision texts. It is created on first use and
  # shared by inserted_words and removed_words.
  def diff
    @diff ||= Diff.new(@old_revision.text, @new_revision.text)
  end
end
80
+ end
81
+ end