wikipedia-vandalism_detection 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,12 @@
1
+ require 'wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence'
2
+
3
module Wikipedia
  module VandalismDetection
    # Mixin that exposes the text algorithms as plain helper methods.
    module Algorithms
      # Delegates to a fresh KullbackLeiblerDivergence instance.
      #
      # @param text_a [Object] first argument handed to KullbackLeiblerDivergence#of
      # @param text_b [Object] second argument handed to KullbackLeiblerDivergence#of
      # @return [Object] whatever KullbackLeiblerDivergence#of returns
      def kullback_leibler_divergence(text_a, text_b)
        KullbackLeiblerDivergence.new.of(text_a, text_b)
      end
    end
  end
end
@@ -0,0 +1,202 @@
1
+ require 'weka'
2
+ require 'active_support/core_ext/string'
3
+ require 'fileutils'
4
+
5
+ require 'wikipedia/vandalism_detection/configuration'
6
+ require 'wikipedia/vandalism_detection/edit'
7
+ require 'wikipedia/vandalism_detection/feature_calculator'
8
+ require 'wikipedia/vandalism_detection/instances'
9
+ require 'wikipedia/vandalism_detection/evaluator'
10
+
11
+ module Wikipedia
12
+ module VandalismDetection
13
# Wraps the Weka classifier selected in the configuration and uses it to
# score edits or raw feature value arrays.
class Classifier
  attr_reader :evaluator, :dataset

  # Builds the classifier configured in the config file.
  #
  # @param dataset [Object, nil] training instances; when nil the dataset is
  #   chosen from the configured training data options.
  def initialize(dataset = nil)
    @config = Wikipedia::VandalismDetection.config
    @feature_calculator = FeatureCalculator.new
    @classifier = load_classifier(dataset)
    @evaluator = Evaluator.new(self)
  end

  # Returns the concrete classifier instance built from the configuration,
  # e.g. a Weka::Classifiers::Trees::RandomForest instance when a
  # Trees::RandomForest classifier is configured. Use it to call the
  # classifier's native methods directly.
  def classifier_instance
    @classifier
  end

  # Classifies an edit or an array of feature values and returns the
  # vandalism confidence by default.
  # With the option 'return_all_params: true' a Hash of the form
  # { confidence: ..., class_index: ... } is returned instead.
  # When no feature values are available, -1.0 is returned regardless of the
  # options.
  #
  # @example
  #   # suppose the dataset has 2 features, or 'edit' is an instance of
  #   # Wikipedia::VandalismDetection::Edit
  #   classifier = Wikipedia::VandalismDetection::Classifier.new
  #   features = [0.45, 0.67]
  #
  #   confidence = classifier.classify(features)
  #   confidence = classifier.classify(edit)
  #
  # @raise [ArgumentError] if the input is neither an Edit nor an Array with
  #   one value per configured feature
  def classify(edit_or_features, options = {})
    configured_features = @config.features
    given_features = edit_or_features.is_a?(Array) &&
                     edit_or_features.size == configured_features.count
    given_edit = edit_or_features.is_a?(Edit)

    unless given_edit || given_features
      raise ArgumentError, 'Input has to be an Edit or an Array of feature values.'
    end

    feature_values =
      if given_edit
        @feature_calculator.calculate_features_for(edit_or_features)
      else
        edit_or_features
      end
    return -1.0 if feature_values.empty?

    # Missing values are handed to Weka as nil.
    feature_values = feature_values.map do |value|
      value == Features::MISSING_VALUE ? nil : value
    end

    dataset = Instances.empty
    dataset.set_class_index(feature_values.count)
    dataset.add_instance([*feature_values, Instances::VANDALISM])

    # The class value above is only a placeholder; mark it as missing so the
    # classifier has to predict it.
    instance = dataset.instance(0)
    instance.set_class_missing

    index = confidence_class_index
    confidence = @classifier.distribution_for_instance(instance).to_a[index]

    return confidence unless options[:return_all_params]

    class_index = @classifier.classify_instance(instance)
    class_index = class_index.nan? ? Instances::NOT_KNOWN_INDEX : class_index.to_i
    { confidence: confidence, class_index: class_index }
  end

  # Cross validates the classifier by delegating to the evaluator.
  # The fold is used as defined in the configuration (default is 10).
  #
  # @example
  #   classifier = Wikipedia::VandalismDetection::Classifier.new
  #   evaluation = classifier.cross_validate
  #   evaluation = classifier.cross_validate(equally_distributed: true)
  #
  def cross_validate(options = {})
    @evaluator.cross_validate(options)
  end

  private

  # Loads and trains the (Weka) classifier set in the configuration.
  # Also stores the dataset that was used for training in @dataset.
  def load_classifier(dataset)
    classifier_name = @config.classifier_type

    unless classifier_name
      raise ClassifierNotConfiguredError,
            'Classifier type is not defined in wikipedia-vandalism-detection.yml'
    end

    if @config.features.blank?
      raise FeaturesNotConfiguredError,
            'No features configured in wikipedia-vandalism-detection.yml'
    end

    classifier_class =
      begin
        "Weka::Classifiers::#{classifier_name}".constantize
      rescue
        raise ClassifierUnknownError,
              "The configured classifier type '#{classifier_name}' is unknown."
      end

    options = @config.classifier_options

    puts "Loading classifier #{classifier_name} with options '#{options}'…"

    dataset = configured_training_dataset if dataset.nil?

    if @config.use_occ?
      dataset.rename_attribute_value(
        dataset.class_index,
        one_class_index,
        Instances::OUTLIER
      )
    end

    @dataset = dataset

    begin
      # options and dataset stay plain locals so the build block can see them.
      classifier_class.build do
        use_options options if options
        train_with_instances dataset
      end
    rescue => error
      raise "Error while loading classifier: #{error}"
    end
  end

  # Picks the training instances matching the configured training data
  # options and announces the choice on stdout. Returns nil when none of the
  # configuration predicates holds.
  def configured_training_dataset
    if @config.balanced_training_data?
      puts 'using BALANCED training dataset'
      TrainingDataset.balanced_instances
    elsif @config.unbalanced_training_data?
      puts 'using FULL (unbalanced) training dataset'
      TrainingDataset.instances
    elsif @config.oversampled_training_data?
      puts 'using OVERSAMPLED training dataset'
      TrainingDataset.oversampled_instances
    end
  end

  # Truthy when the classifier options mention the vandalism class.
  def vandalism_in_classifier_options?
    @config.classifier_options =~ /#{Instances::VANDALISM}/
  end

  # Index into the class distribution from which the confidence is read.
  def confidence_class_index
    return Instances::VANDALISM_CLASS_INDEX unless @config.use_occ?

    if vandalism_in_classifier_options?
      Instances::VANDALISM_CLASS_INDEX
    else
      Instances::REGULAR_CLASS_INDEX
    end
  end

  # Index of the class value that is renamed to the outlier value for one
  # class classification: the class not mentioned in the classifier options.
  def one_class_index
    if vandalism_in_classifier_options?
      Instances::REGULAR_CLASS_INDEX
    else
      Instances::VANDALISM_CLASS_INDEX
    end
  end

  # Returns a normalized dataset that contains only the vandalism instances
  # of the given dataset.
  def remove_regular_instances(dataset)
    features = @config.features

    vandalism_dataset = Weka::Core::Instances.new.with_attributes do
      features.each { |name| numeric name.tr(' ', '_').to_sym }
      nominal :class, values: [Instances::VANDALISM], class_attribute: true
    end

    dataset.to_a.map(&:values).each_with_index do |attributes, index|
      class_position = dataset.instance(index).value(dataset.class_index).to_i
      class_value = Instances::CLASSES[class_position]
      next unless class_value == Instances::VANDALISM

      # Drop the trailing class attribute and re-append the class name.
      vandalism_dataset.add_instance([*attributes[0..-2], class_value])
    end

    Weka::Filters::Unsupervised::Attribute::Normalize.new.filter(vandalism_dataset)
  end
end
201
+ end
202
+ end
@@ -0,0 +1,350 @@
1
+ require 'weka/classifiers/meta/one_class_classifier'
2
+ require 'singleton'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ require 'yaml'
7
+
8
# Shortcut to the configuration singleton.
#
# @return [Configuration] the object returned by Configuration.instance
def self.config
  Configuration.instance
end
11
+
12
# Singleton holding the effective configuration: the defaults, deep-merged
# with the values of the config file when one could be loaded.
class Configuration
  include Singleton

  TRAINING_DATA_BALANCED = 'balanced'.freeze
  TRAINING_DATA_UNBALANCED = 'unbalanced'.freeze
  TRAINING_DATA_OVERSAMPLED = 'oversampled'.freeze
  CONFIG_FILE = 'wikipedia-vandalism-detection.yml'.freeze

  # Whole-value match for an affirmative setting. Anchored on purpose so
  # that values merely containing a "t" or "y" do not count as true.
  AFFIRMATIVE_VALUE = /\A\s*(?:true|t|yes|y)\s*\z/i

  # Method names that can be turned into an instance variable name.
  ACCESSOR_NAME = /\A[A-Za-z_]\w*\z/

  # Maps the <progress stage> part of a path method name to its config key.
  PATH_SECTIONS = { 'corpus' => 'corpora', 'output' => 'output' }.freeze

  attr_reader :data,
              :features,
              :classifier_options,
              :classifier_type,
              :cross_validation_fold,
              :output_base_directory,
              :training_data_options

  def initialize
    config = DefaultConfiguration[DefaultConfiguration::DEFAULTS]
    @config_from_file ||= config.load_config_file(config.config_file)

    @data ||= @config_from_file ? config.deep_merge(@config_from_file) : config

    classifier = @data['classifier']
    @classifier_type = classifier['type']
    @classifier_options = classifier['options']
    @cross_validation_fold = classifier['cross-validation-fold']
    @training_data_options = classifier['training-data-options']
    @replace_missing_values = classifier['replace-missing-values'].to_s

    output = @data['output']
    @features = @data['features']
    @output_base_directory = File.expand_path(output['base_directory'], __FILE__)
    @training_arff_file_name = output['training']['arff_file']
    @test_arff_file_name = output['test']['arff_file']
  end

  # Returns whether the classifier uses one class classification.
  def use_occ?
    @classifier_type == Weka::Classifiers::Meta::OneClassClassifier.type
  end

  # Returns whether missing values of the training data shall be replaced.
  # True only if the configured value is exactly true, t, yes or y
  # (case-insensitive, surrounding whitespace ignored).
  def replace_training_data_missing_values?
    !!(@replace_missing_values =~ AFFIRMATIVE_VALUE)
  end

  # Returns a boolean value whether a balanced data set is used for
  # classifier training.
  # (balanced means: same number of vandalism and regular samples)
  def balanced_training_data?
    @training_data_options == TRAINING_DATA_BALANCED
  end

  # Returns a boolean value whether an unbalanced data set is used for
  # classifier training.
  # (unbalanced means: vandalism and regular samples are used as given in
  # the arff file)
  # This is the fallback: everything that is neither balanced nor
  # oversampled, including a missing option, counts as unbalanced.
  def unbalanced_training_data?
    !(balanced_training_data? || oversampled_training_data?)
  end

  # Returns a boolean value whether an oversampled data set is used for
  # classifier training.
  # (oversampled means: a balanced dataset is enriched through vandalism
  # instances if vandalism number is less than regular number)
  def oversampled_training_data?
    !@training_data_options.nil? &&
      @training_data_options.include?(TRAINING_DATA_OVERSAMPLED)
  end

  # Returns a hash of the oversampled training data options, or an empty
  # hash when the training data is not oversampled.
  # Allowed options are -p (-percentage) and -u (-undersampling).
  # Both values default to 100.0.
  def oversampling_options
    return {} unless oversampled_training_data?

    params = @training_data_options.gsub(TRAINING_DATA_OVERSAMPLED, '').split('-')
    default_value = 100.0

    percent_option = params.find { |param| param.match(/(p\s|percentage\s)\d+/i) }
    undersampling_option = params.find { |param| param.match(/(u\s|undersampling\s)/i) }

    percent = percent_option.nil? ? default_value : percent_option.split.last.to_f

    # NOTE(review): the value is read from the option's last token with
    # String#to_f, so a non-numeric token such as 'true' yields 0.0, and an
    # option without true/t/yes/y yields 0.0 as well. Confirm that this is
    # intended.
    undersampling =
      if undersampling_option.nil?
        default_value
      elsif undersampling_option.match(/(true|t|yes|y)/i)
        undersampling_percentage = undersampling_option.split.last
        undersampling_percentage.nil? ? default_value : undersampling_percentage.to_f
      else
        0.0
      end

    { percentage: percent, undersampling: undersampling }
  end

  # Returns the path to the classification file.
  # Subdirectories for the classifier and the training data options are
  # added automatically, which results in
  # <output base dir>/<classifier name>/<training data options>/<file name>
  #
  # Missing training data options are treated as 'unbalanced', in line with
  # #unbalanced_training_data?, instead of raising a NoMethodError.
  def test_output_classification_file
    classification_file_name = @data['output']['test']['classification_file']
    classifier_name = @classifier_type.split('::').last.downcase
    training_data_options = @training_data_options || TRAINING_DATA_UNBALANCED

    File.join(
      @output_base_directory,
      classifier_name,
      training_data_options.gsub(/\s+/, '_'),
      classification_file_name
    )
  end

  # Returns the training arff file name.
  # The file is located in the same directory as the classification file.
  def training_output_arff_file
    File.join(File.dirname(test_output_classification_file), @training_arff_file_name)
  end

  # Returns the test arff file name.
  # The file is located in the same directory as the classification file.
  def test_output_arff_file
    File.join(File.dirname(test_output_classification_file), @test_arff_file_name)
  end

  # Returns file/path strings for corpora files/directories and output files
  # following the schema <corpus type>_<progress stage>_<file name>.
  #
  # Instead of 'corpora' the word 'corpus' is used for grammatical reasons.
  #
  # example:
  #   training_corpus_edits_file()
  #   test_output_index_file()
  #
  # A name that matches an already defined instance variable returns that
  # variable's value; resolved paths are stored that way, so each path is
  # computed only once. Names that cannot be an instance variable name
  # (e.g. ending in '?' or '=') are passed on to super and therefore raise a
  # NoMethodError.
  def method_missing(method_name, *args)
    return super unless accessor_name?(method_name)

    variable_name = "@#{method_name}"
    return instance_variable_get(variable_name) if instance_variable_defined?(variable_name)

    corpus_type, progress_stage, *file_key_parts = method_name.to_s.split('_')
    # The file name part itself consists of at least two words.
    return super if file_key_parts.size < 2

    section = PATH_SECTIONS[progress_stage]

    case section
    when 'corpora'
      path = File.join(
        @data[section]['base_directory'],
        @data[section][corpus_type]['base_directory']
      )
    when 'output'
      path = @output_base_directory
    else
      return super
    end

    relative_path = File.join(path, @data[section][corpus_type][file_key_parts.join('_')])
    absolute_path = File.expand_path(relative_path, __FILE__)
    instance_variable_set(variable_name, absolute_path)
  end

  # Keeps respond_to? in line with method_missing: true for names of defined
  # instance variables and for path names whose file key is present in the
  # configuration data.
  def respond_to_missing?(method_name, include_private = false)
    return super unless accessor_name?(method_name)
    return true if instance_variable_defined?("@#{method_name}")

    corpus_type, progress_stage, *file_key_parts = method_name.to_s.split('_')
    section = PATH_SECTIONS[progress_stage]
    return super if section.nil? || file_key_parts.size < 2 || @data.nil?

    files = @data[section] && @data[section][corpus_type]
    (files.respond_to?(:key?) && files.key?(file_key_parts.join('_'))) || super
  end

  private

  # Whether "@<method_name>" would be an acceptable instance variable name.
  def accessor_name?(method_name)
    !!(method_name.to_s =~ ACCESSOR_NAME)
  end
end
184
+
185
# This class represents the default config which is merged with the
# customized config from the config YAML file. It also locates and loads
# that custom config file.
class DefaultConfiguration < Hash
  # Default settings. Only the outer hash is frozen; nested values are not.
  # 'source' and the output base directory are derived from the working
  # directory at the time this file is loaded.
  DEFAULTS = {
    'source' => Dir.pwd,
    'features' => [
      'anonymity',
      'anonymity previous',
      'all wordlists frequency',
      'all wordlists impact',
      'article size',
      'bad frequency',
      'bad impact',
      'biased frequency',
      'biased impact',
      'blanking',
      'character sequence',
      'character diversity',
      'comment length',
      'comment biased frequency',
      'comment pronoun frequency',
      'comment vulgarism frequency',
      'compressibility',
      'copyedit',
      'digit ratio',
      'edits per user',
      'emoticons frequency',
      'emoticons impact',
      'inserted size',
      'inserted words',
      'inserted character distribution',
      'inserted external links',
      'inserted internal links',
      'longest word',
      'markup frequency',
      'markup impact',
      'non-alphanumeric ratio',
      'personal life',
      'pronoun frequency',
      'pronoun impact',
      'removed size',
      'removed words',
      'removed all wordlists frequency',
      'removed bad frequency',
      'removed biased frequency',
      'removed character distribution',
      'removed emoticons frequency',
      'removed markup frequency',
      'removed pronoun frequency',
      'removed sex frequency',
      'removed vulgarism frequency',
      'replacement similarity',
      'reverted',
      'revisions character distribution',
      'sex frequency',
      'sex impact',
      'same editor',
      'size increment',
      'size ratio',
      'term frequency',
      'time interval',
      'time of day',
      'upper case ratio',
      'upper case words ratio',
      'upper to lower case ratio',
      'vulgarism frequency',
      'vulgarism impact',
      'weekday',
      'words increment'
    ],
    'corpora' => {
      'base_directory' => nil,
      'training' => {
        'base_directory' => nil,
        'edits_file' => nil,
        'annotations_file' => nil,
        'revisions_directory' => nil
      },
      'test' => {
        'base_directory' => nil,
        'edits_file' => nil,
        'revisions_directory' => nil,
        'ground_truth_file' => nil
      }
    },
    'output' => {
      'base_directory' => File.join(Dir.pwd, 'build'),
      'training' => {
        'arff_file' => 'training.arff',
        'index_file' => 'training_index.yml'
      },
      'test' => {
        'arff_file' => 'test.arff',
        'index_file' => 'test_index.yml',
        'classification_file' => 'classification.txt'
      }
    },
    'classifier' => {
      'type' => nil,
      'options' => nil,
      'cross-validation-fold' => 10,
      'training-data-options' => 'unbalanced',
      'replace-missing-values' => nil
    }
  }.freeze

  # Returns the default source directory (DEFAULTS['source']).
  def source
    DEFAULTS['source']
  end

  # Returns the path of the custom config file.
  #
  # Looks in <source>/config/ first, then in <source>/lib/config/. If
  # neither exists, the directory of this file and its parents are searched
  # for config/<config file>; when nothing is found at all, the topmost
  # directory reached by that search is returned.
  def config_file
    config_file_path = "config/#{Configuration::CONFIG_FILE}"

    candidates = [
      File.join(source, config_file_path),
      File.join(source, "lib/#{config_file_path}")
    ]

    # The parent directory walk is only done when both candidates are missing.
    candidates.find { |candidate| File.exist?(candidate) } ||
      find_first_parent_path_for(
        File.expand_path(File.dirname(__FILE__)),
        config_file_path
      )
  end

  # Loads the given config file and returns the parsed YAML content.
  #
  # The file is only loaded if it is a regular file whose path contains the
  # config file name literally. Otherwise a hint is written via `warn` and
  # the return value of `warn` (nil) is returned.
  #
  # NOTE(review): YAML.load_file is used on the config file; make sure that
  # file is trusted when running on a Ruby/Psych version where load is not
  # safe by default.
  def load_config_file(file)
    config_file = Configuration::CONFIG_FILE

    # Regexp.escape keeps '.' in the file name from matching any character;
    # File.file? rejects directories, which YAML.load_file cannot read.
    if File.file?(file) && file =~ /#{Regexp.escape(config_file)}/
      YAML.load_file(file)
    else
      warn %(

        Configuration file not found in
        #{source}/config,
        #{source}/lib/config directory
        or any other parent path.

        To customize the system, create a '#{config_file}' file.

      )
    end
  end

  private

  # Walks from start_path up to the file system root and returns the first
  # existing <directory>/<file> path. Returns the topmost directory itself
  # if the file is not found in any of them.
  def find_first_parent_path_for(start_path, file)
    directory = start_path

    loop do
      candidate = File.join(directory, file)
      return candidate if File.exist?(candidate)

      parent = File.dirname(directory)
      return directory if parent == directory # reached the root

      directory = parent
    end
  end
end
349
+ end
350
+ end
@@ -0,0 +1,36 @@
1
+ module Wikipedia
2
+ module VandalismDetection
3
+ require 'java'
4
+ require 'java/diffutils-1.3.0.jar'
5
+
6
+ java_import 'difflib.DiffUtils'
7
+
8
# Word based diff between two texts, computed by DiffUtils.diff on the
# whitespace separated words of both texts.
class Diff
  # original - the text before the change
  # current  - the text after the change
  #
  # Both texts are re-encoded by #clean_text before they are split.
  def initialize(original, current)
    @original, @current = [original, current].map { |text| clean_text(text) }
    @patch = DiffUtils.diff(@original.split, @current.split)
  end

  # Returns the lines of the revised side of all deltas as one flat list.
  def inserted_words
    collect_words { |delta| delta.revised }
  end

  # Returns the lines of the original side of all deltas as one flat list.
  def removed_words
    collect_words { |delta| delta.original }
  end

  private

  # Yields every delta of the patch and flattens the lines of the chunks
  # returned by the block.
  def collect_words
    @patch.deltas.map { |delta| yield(delta).lines }.flatten
  end

  # Re-encodes the text from binary to UTF-8; invalid and undefined
  # sequences are replaced by an empty string.
  def clean_text(text)
    replacements = { invalid: :replace, undef: :replace, replace: '' }
    text.encode('UTF-8', 'binary', **replacements)
  end
end
35
+ end
36
+ end
@@ -0,0 +1,81 @@
1
+ require 'wikipedia/vandalism_detection/diff'
2
+ require 'wikipedia/vandalism_detection/text'
3
+ require 'wikipedia/vandalism_detection/page'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
# An edit is the pair of an old revision and the new revision that directly
# follows it, optionally together with the page it belongs to.
class Edit
  attr_reader :old_revision, :new_revision
  attr_accessor :page

  # old_revision, new_revision - revisions responding to #id and #parent_id
  # attributes - optional hash; :page sets the page (default: Page.new)
  #
  # Raises ArgumentError if new_revision's parent is not old_revision.
  def initialize(old_revision, new_revision, attributes = {})
    details = "old revision: #{old_revision.id} | parent: #{old_revision.parent_id},
      new revision: #{new_revision.id} | parent: #{new_revision.parent_id}"

    sequent?(old_revision, new_revision) ||
      raise(ArgumentError, "Revisions are not sequent: #{details}.")

    @old_revision = old_revision
    @new_revision = new_revision
    @page = attributes[:page] || Page.new
  end

  # Returns a string with the requested attribute values of the old and the
  # new revision: values are joined by ',' per revision, both revisions are
  # separated by a tab. Attributes a revision does not respond to are
  # skipped.
  def serialize(*attributes)
    old_revision_string = serialize_revision(@old_revision, attributes)
    new_revision_string = serialize_revision(@new_revision, attributes)

    [old_revision_string, new_revision_string].join("\t")
  end

  # Returns an array of the words inserted in the new revision compared with
  # the old one.
  def inserted_words
    @inserted_words ||= diff.inserted_words
  end

  # Returns a Text of the words inserted in the new revision compared with
  # the old one.
  def inserted_text
    @inserted_text ||= Text.new(inserted_words.join(' '))
  end

  # Returns an array of the words removed in the new revision compared with
  # the old one.
  def removed_words
    @removed_words ||= diff.removed_words
  end

  # Returns a Text of the words removed in the new revision compared with
  # the old one.
  def removed_text
    @removed_text ||= Text.new(removed_words.join(' '))
  end

  protected

  # Returns whether the given revisions are sequent, i.e. the old revision's
  # id is the new revision's parent id.
  def sequent?(old_revision, new_revision)
    new_revision.parent_id == old_revision.id
  end

  private

  # Memoized diff between the texts of the old and the new revision.
  def diff
    @diff ||= Diff.new(@old_revision.text, @new_revision.text)
  end

  # Joins the values of all attributes the revision responds to by ','.
  def serialize_revision(revision, attributes)
    values = attributes.each_with_object([]) do |attr, parts|
      parts.push(revision.public_send(attr)) if revision.respond_to?(attr)
    end

    values.join(',')
  end
end
80
+ end
81
+ end