wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,640 @@
1
+ require 'wikipedia/vandalism_detection/configuration'
2
+ require 'wikipedia/vandalism_detection/exceptions'
3
+ require 'wikipedia/vandalism_detection/training_dataset'
4
+ require 'wikipedia/vandalism_detection/test_dataset'
5
+ require 'wikipedia/vandalism_detection/classifier'
6
+ require 'wikipedia/vandalism_detection/instances'
7
+ require 'weka'
8
+ require 'fileutils'
9
+ require 'csv'
10
+
11
+ module Wikipedia
12
+ module VandalismDetection
13
+ # This class provides methods for the evaluation of a
14
+ # Wikipedia::VandalismDetection::Classifier using the weka framework.
15
+ #
16
+ # @example
17
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
18
+ # evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
19
+ #
20
+ # evaluation = evaluator.cross_validate
21
+ # evaluation = evaluator.cross_validate(equally_distributed: true)
22
+ #
23
+ # puts evaluation[:precision]
24
+ # puts evaluation[:recall]
25
+ # puts evaluation[:area_under_prc]
26
+ class Evaluator
27
+ DEFAULT_SAMPLE_COUNT = 200
28
+ DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS
29
+
30
# Builds an evaluator for the given trained classifier.
#
# @param classifier [Wikipedia::VandalismDetection::Classifier] the
#   classifier whose performance is to be evaluated
# @raise [ArgumentError] if the argument is not a
#   Wikipedia::VandalismDetection::Classifier
def initialize(classifier)
  unless classifier.is_a?(Wikipedia::VandalismDetection::Classifier)
    raise ArgumentError,
          'The classifier argument has to be an instance of ' \
          'Wikipedia::VandalismDetection::Classifier'
  end

  @classifier = classifier
  @classifier_instance = classifier.classifier_instance
  @config = Wikipedia::VandalismDetection.config
end
41
+
42
# Cross validates the classifier.
# The fold is taken from the configuration; when none is configured,
# the default from DEFAULTS['classifier']['cross-validation-fold']
# is used (default is 10).
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluation = classifier.cross_validate
#   evaluation = classifier.cross_validate(equally_distributed: true)
#
def cross_validate(options = {})
  fold = @config.cross_validation_fold ||
         DEFAULTS['classifier']['cross-validation-fold']

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end
62
+
63
# Returns a Hash comprising the evaluation curve data Arrays for
# precision and recall, plus the area under the precision-recall curve.
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = classifier.evaluator
#   # or
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#
#   curve_data = evaluator.curve_data
#
#   curve_data[:precision]
#   # => [0.76, ..., 0.91]
#
#   curve_data[:recall]
#   # => [0.87, ..., 0.89]
#
#   curve_data[:area_under_prc]
#   # => 0.83
def curve_data(options = {})
  evaluations = cross_validate(options)
  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new

  # cross_validate may return an Array of evaluations — presumably one
  # per equally-distributed run; only the first is used here.
  # TODO(review): confirm against cross_validate_equally_distributed.
  evaluation_data = evaluations.is_a?(Array) ? evaluations[0] : evaluations

  instances = threshold_curve.curve(
    evaluation_data.predictions,
    Instances::VANDALISM_CLASS_INDEX
  )

  # Attribute 0 is read as precision and attribute 1 as recall —
  # assumed from this usage; verify against Weka's ThresholdCurve
  # attribute layout.
  precision = instances.attribute_to_double_array(0).to_a
  recall = instances.attribute_to_double_array(1).to_a
  area_under_prc = evaluation_data.area_under_prc(Instances::VANDALISM_CLASS_INDEX)

  {
    precision: precision,
    recall: recall,
    area_under_prc: area_under_prc
  }
end
103
+
104
# Evaluates the classification of the configured test corpus against
# the given ground truth.
# Note: the classification file is (re)created unconditionally before
# evaluation (see create_testcorpus_classification_file!).
#
# The number of threshold samples can be set by the
# 'sample_count: <number>' option.
# Default number of samples is DEFAULT_SAMPLE_COUNT (200).
#
# Returns a Hash with values:
#   :recalls - recall values
#   :precisions - precision values
#   :fp_rates - false positive rate values
#   :tp_rates - true positive rate values
#   :pr_auc - area under the precision-recall curve
#   :roc_auc - area under the receiver operating characteristic curve
#   :total_recall - overall classifier recall value
#   :total_precision - overall classifier precision value
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = classifier.evaluator
#   # or
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#
#   evaluator.evaluate_testcorpus_classification
#   evaluator.evaluate_testcorpus_classification(sample_count: 50)
#
# @raise [GroundTruthFileNotConfiguredError] if no ground truth file
#   path is configured
# @raise [GroundTruthFileNotFoundError] if the configured ground truth
#   file does not exist
def evaluate_testcorpus_classification(options = {})
  ground_truth_file_path = @config.test_corpus_ground_truth_file

  unless ground_truth_file_path
    message = 'Ground truth file path has to be set for test set evaluation'
    raise GroundTruthFileNotConfiguredError, message
  end

  unless File.exist?(ground_truth_file_path)
    message = 'Configured ground truth file is not available.'
    raise GroundTruthFileNotFoundError, message
  end

  ground_truth = ground_truth_hash(ground_truth_file_path)
  create_testcorpus_classification_file!(@config.test_output_classification_file, ground_truth)
  classification = classification_hash(@config.test_output_classification_file)

  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, sample_count)
  # total precision/recall are taken from the point on the curve that
  # maximizes precision * recall
  precision_recall = maximum_precision_recall(curves[:precisions], curves[:recalls])

  curves[:total_recall] = precision_recall[:recall]
  curves[:total_precision] = precision_recall[:precision]

  curves
end
157
+
158
# Returns the performance curve points (recall, precision, fp-rate)
# and the computed areas under the PR and ROC curves.
#
# @param ground_truth [Hash] annotated samples (see predictive_values)
# @param classification [Hash] classification results keyed by
#   :"<old_revision_id>-<new_revision_id>"
# @param sample_count [Integer] number of threshold steps in (0, 1)
def test_performance_curves(ground_truth, classification, sample_count)
  thresholds = (0.0...1.0).step(1.0 / sample_count.to_f).to_a

  # remove first value to not use the [0,1] value in curve
  # (threshold 0.0 yields the degenerate all-positive point)
  thresholds.shift

  precisions = []
  recalls = []
  fp_rates = []

  thresholds.each do |threshold|
    values = predictive_values(ground_truth, classification, threshold)
    performance_params = performance_parameters(
      values[:tp],
      values[:fp],
      values[:tn],
      values[:fn]
    )

    precisions.push performance_params[:precision]
    recalls.push performance_params[:recall]
    fp_rates.push performance_params[:fp_rate]
  end

  # true positive rate equals recall by definition
  tp_rates = recalls
  pr_sorted = sort_curve_values(recalls, precisions, x: 0.0, y: 0.0)
  # NOTE(review): the ROC start point {x: 1.0, y: 0.0} looks unusual —
  # ROC curves conventionally start at (0, 0); confirm this is intended.
  roc_sorted = sort_curve_values(fp_rates, tp_rates, y: 0.0, x: 1.0)

  recalls = pr_sorted[:x]
  precisions = pr_sorted[:y]
  fp_rates = roc_sorted[:x]
  tp_rates = roc_sorted[:y]

  # trapezoidal areas under the sorted curves
  pr_auc = area_under_curve(recalls, precisions)
  roc_auc = area_under_curve(fp_rates, tp_rates)

  {
    precisions: precisions, recalls: recalls,
    fp_rates: fp_rates, tp_rates: tp_rates,
    pr_auc: pr_auc, roc_auc: roc_auc
  }
end
202
+
203
# Returns the predictive values hash (TP, FP, TN, FN) for a certain
# threshold.
# Ground truth entries with no matching classification entry are
# skipped.
def predictive_values(ground_truth, classification, threshold)
  counts = { tp: 0, fp: 0, tn: 0, fn: 0 }

  ground_truth.each_value do |values|
    # classification rows are keyed by "<old rev id>-<new rev id>"
    key = :"#{values[:old_revision_id]}-#{values[:new_revision_id]}"
    next unless classification.key?(key)

    target_class = values[:class]
    confidence = classification[key][:confidence]

    counts[:tp] += 1 if Evaluator.true_positive?(target_class, confidence, threshold)
    counts[:fn] += 1 if Evaluator.false_negative?(target_class, confidence, threshold)
    counts[:fp] += 1 if Evaluator.false_positive?(target_class, confidence, threshold)
    counts[:tn] += 1 if Evaluator.true_negative?(target_class, confidence, threshold)
  end

  counts
end
229
+
230
# Returns whether the given confidence value represents a
# true positive (TP): the edit is annotated as vandalism and its
# confidence lies strictly above the threshold.
def self.true_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT

  confidence.to_f > threshold.to_f
end
235
+
236
# Returns whether the given confidence value represents a
# true negative (TN): the edit is annotated as regular and its
# confidence lies strictly below the threshold.
def self.true_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT

  confidence.to_f < threshold.to_f
end
241
+
242
# Returns whether the given confidence value represents a
# false positive (FP): the edit is annotated as regular but its
# confidence is at or above the threshold (ties count as FP).
def self.false_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT

  confidence.to_f >= threshold.to_f
end
247
+
248
# Returns whether the given confidence value represents a
# false negative (FN): the edit is annotated as vandalism but its
# confidence is at or below the threshold (ties count as FN).
def self.false_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT

  confidence.to_f <= threshold.to_f
end
253
+
254
# Returns a hash with the performance parameters (precision, recall
# and false positive rate) computed from the given TP, FP, TN, FN
# counts.
# A zero denominator yields 1.0 for the respective parameter.
def performance_parameters(tp, fp, tn, fn)
  precision = (tp + fp).zero? ? 1.0 : tp.fdiv(tp + fp)
  recall    = (tp + fn).zero? ? 1.0 : tp.fdiv(tp + fn)
  fp_rate   = (fp + tn).zero? ? 1.0 : fp.fdiv(fp + tn)

  { precision: precision, recall: recall, fp_rate: fp_rate }
end
267
+
268
# Returns the calculated area under the piecewise-linear curve given
# by the point arrays, using the trapezoidal rule.
# Both arrays must be float arrays of the same length.
#
# @raise [ArgumentError] if the arrays differ in length
def area_under_curve(x_values, y_values)
  unless x_values.count == y_values.count
    raise ArgumentError, 'x and y values must have the same length!'
  end

  # trapezoid area formula: A = 1/2 * (b1 + b2) * h, summed over each
  # adjacent pair of points
  area = x_values.zip(y_values).each_cons(2).sum(0.0) do |(x1, y1), (x2, y2)|
    0.5 * (y1 + y2) * (x2 - x1)
  end

  area.abs
end
291
+
292
# Returns the given value arrays sorted by the x values (ties broken
# by descending y) with duplicate (x, y) pairs removed.
# Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> }.
# start_values is added in front of the arrays if set and not already
# present, e.g. { x: 0.0, y: 1.0 }; end_values is added to the end of
# the arrays likewise, e.g. { x: 1.0, y: 1.0 }. A missing :x or :y in
# either hash falls back to the current first/last value.
#
# @example
#   evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
#   #=> Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  pairs = x_values.each_with_index.map { |x, index| [x, y_values[index]] }
  pairs = pairs.sort_by { |x, y| [x, -y] }.uniq

  # fix: the original computed pairs.transpose twice; destructure the
  # single transpose instead
  x, y = pairs.transpose

  start_values_set = start_values && (start_values.key?(:x) || start_values.key?(:y))
  end_values_set = end_values && (end_values.key?(:x) || end_values.key?(:y))

  if start_values_set
    unless x.first == start_values[:x] && y.first == start_values[:y]
      x.unshift(start_values[:x] || x.first)
      y.unshift(start_values[:y] || y.first)
    end
  end

  if end_values_set
    unless x.last == end_values[:x] && y.last == end_values[:y]
      x.push(end_values[:x] || x.last)
      y.push(end_values[:y] || y.last)
    end
  end

  { x: x, y: y }
end
326
+
327
# Returns the precision/recall pair whose product precision * recall
# is maximal. Pairs involving NaN values are ignored.
#
# @param precisions [Array<Float>] precision curve values
# @param recalls [Array<Float>] recall curve values (same length)
def maximum_precision_recall(precisions, recalls)
  areas = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # remove pairs with NaN values
  areas.select! { |pair| pair.none? { |value| value.to_f.nan? } }

  # fix: `max` alone suffices — the original's `sort.max` did an
  # O(n log n) sort for the same result
  max_index = areas.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end
339
+
340
# Creates the test corpus classification text file by classifying the
# configured test samples. All sub steps (as creating the test arff
# file, etc.) are run automatically if needed (via TestDataset.build!).
#
# The file holds one header row (OLDREVID NEWREVID C CONF followed by
# the upcased feature names) and one space-separated row per instance.
#
# @param file_path [String] output path for the classification file
# @param ground_truth_data [Hash] must not be nil (it is otherwise
#   unused here; presumably kept for interface symmetry — TODO confirm)
# @raise [ArgumentError] if ground_truth_data is nil
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  if ground_truth_data.nil?
    raise ArgumentError, 'Ground truth data hash is not allowed to be nil'
  end

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  FileUtils.mkdir_p(dir_name) unless Dir.exist?(dir_name)

  # fix: the original used File.open without a block and leaked the
  # handle if classification raised mid-loop; the block form always
  # closes the file
  File.open(file_path, 'w') do |file|
    # the last two attributes are the revision ids; drop them from the
    # feature name header
    feature_names = dataset.attribute_names.map(&:upcase)[0...-2]
    header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

    file.puts header

    dataset.to_m.to_a.each do |instance|
      # instance layout: [*features, old_rev_id, new_rev_id, class]
      features = instance[0...-3]
      old_revision_id = instance[-3].to_i
      new_revision_id = instance[-2].to_i
      ground_truth_class_name = Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

      classification = @classifier.classify(features, return_all_params: true)

      class_value =
        if classification[:class_index] == Instances::VANDALISM_CLASS_INDEX
          1.0
        elsif classification[:class_index] == Instances::REGULAR_CLASS_INDEX
          0.0
        else
          Features::MISSING_VALUE
        end

      # fall back to the class value when the classifier yields no
      # confidence
      confidence = classification[:confidence] || class_value

      # NOTE(review): confidences are inverted for one-class classifiers
      # whose options mention the vandalism class — presumably because
      # such classifiers score the opposite class; confirm.
      must_be_inverted = @config.use_occ? && !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
      confidence_value = must_be_inverted ? 1.0 - confidence : confidence
      # replace NaN feature values with the missing-value marker
      features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

      file.puts [
        old_revision_id,
        new_revision_id,
        ground_truth_class_name,
        confidence_value,
        *features
      ].join(' ')
    end
  end
end
392
+
393
+ # Returns a hash comprising each feature's predictive values analysis for
394
+ # different thresholds.
395
+ # The Hash structure is the following one:
396
+ # {
397
+ # feature_name_1:
398
+ # {
399
+ # 0.0 => {fp: , fn: , tp: , tn: },
400
+ # ... => {fp: , fn: , tp: , tn: },
401
+ # 1.0 => {fp: , fn: , tp: , tn: }
402
+ # },
403
+ # ...,
404
+ # feature_name_n:
405
+ # {
406
+ # 0.0 => {fp: , fn: , tp: , tn: },
407
+ # ... => {fp: , fn: , tp: , tn: },
408
+ # 1.0 => {fp: , fn: , tp: , tn: }
409
+ # },
410
+ # }
411
+ def feature_analysis(options = {})
412
+ sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
413
+ thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a
414
+
415
+ ground_truth_file_path = @config.test_corpus_ground_truth_file
416
+ training_dataset = TrainingDataset.instances
417
+ test_dataset = TestDataset.build!
418
+
419
+ analysis = {}
420
+
421
+ @config.features.each_with_index do |feature_name, index|
422
+ puts "analyzing feature… '#{feature_name}'"
423
+
424
+ dataset = filter_single_attribute(training_dataset, index)
425
+ print ' | train classifier with feature data…'
426
+ classifier = Classifier.new(dataset)
427
+ print "done \n"
428
+
429
+ classification = classification_data(classifier, test_dataset)
430
+ ground_truth = ground_truth_hash(ground_truth_file_path)
431
+
432
+ values = {}
433
+
434
+ thresholds.each do |threshold|
435
+ values[threshold] = predictive_values(ground_truth, classification, threshold)
436
+ end
437
+
438
+ analysis[feature_name] = values
439
+ end
440
+
441
+ analysis
442
+ end
443
+
444
+ # Returns a hash comprising the classifiers predictive values for using
445
+ # all configured features for different thresholds.
446
+ def full_analysis(options = {})
447
+ sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
448
+ thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a
449
+
450
+ ground_truth_file_path = @config.test_corpus_ground_truth_file
451
+
452
+ puts 'train classifier…'
453
+ classifier = Classifier.new
454
+
455
+ test_dataset = TestDataset.build!
456
+
457
+ puts 'computing classification…'
458
+ classification = classification_data(classifier, test_dataset)
459
+ ground_truth = ground_truth_hash(ground_truth_file_path)
460
+
461
+ analysis = {}
462
+
463
+ thresholds.each do |threshold|
464
+ analysis[threshold] = predictive_values(ground_truth, classification, threshold)
465
+ end
466
+
467
+ print "done\n"
468
+ analysis
469
+ end
470
+
471
+ private
472
+
473
+ # Returns a dataset only holding the attribute at the given index.
474
+ # Weka Unsupervised Attribute Remove filter is used.
475
+ def filter_single_attribute(dataset, attribute_index)
476
+ filter = Weka::Filters::Unsupervised::Attribute::Remove.new
477
+ filter.use_options("-V -R #{attribute_index + 1},#{dataset.class_index + 1}")
478
+
479
+ filtered = filter.filter(dataset)
480
+ filtered.class_index = filtered.attributes_count - 1
481
+ filtered
482
+ end
483
+
484
+ # Returns an array of classification confidences of the test corpus'
485
+ # classification with the given classifier
486
+ def classification_data(classifier, test_dataset)
487
+ classification = {}
488
+
489
+ test_dataset.to_m.to_a.each do |instance|
490
+ features = instance[0...-3]
491
+
492
+ old_revision_id = instance[-3].to_i
493
+ new_revision_id = instance[-2].to_i
494
+
495
+ params = classifier.classify(features, return_all_params: true)
496
+ class_short_name = Instances::CLASSES_SHORT[params[:class_index]]
497
+
498
+ must_be_inverted = @config.use_occ? && @classifier.classifier_instance.options !~ /#{Instances::VANDALISM}/
499
+ confidence = must_be_inverted ? 1.0 - params[:confidence] : params[:confidence]
500
+
501
+ classification[:"#{old_revision_id}-#{new_revision_id}"] = {
502
+ old_revision_id: old_revision_id,
503
+ new_revision_id: new_revision_id,
504
+ class: class_short_name,
505
+ confidence: confidence
506
+ }
507
+ end
508
+
509
+ classification
510
+ end
511
+
512
+ # Returns a hash for classification data from given classification file
513
+ def classification_hash(classification_file)
514
+ file = File.read(classification_file)
515
+ classification_samples = file.lines.to_a
516
+ classification_samples.shift # remove header line
517
+
518
+ classification = {}
519
+
520
+ classification_samples.each do |line|
521
+ line_parts = line.split(' ')
522
+
523
+ old_revision_id = line_parts[0].to_i
524
+ new_revision_id = line_parts[1].to_i
525
+ class_short = line_parts[2]
526
+ confidence = line_parts[3].to_f
527
+
528
+ classification[:"#{old_revision_id}-#{new_revision_id}"] = {
529
+ old_revision_id: old_revision_id,
530
+ new_revision_id: new_revision_id,
531
+ class: class_short,
532
+ confidence: confidence
533
+ }
534
+ end
535
+
536
+ classification
537
+ end
538
+
539
+ # Returns a hash for classification data from given ground truth file
540
+ def ground_truth_hash(ground_truth_file)
541
+ file = File.read(ground_truth_file)
542
+ ground_truth_samples = file.lines.to_a
543
+
544
+ ground_truth = {}
545
+
546
+ ground_truth_samples.each do |line|
547
+ line_parts = line.split(' ')
548
+
549
+ old_revision_id = line_parts[0].to_i
550
+ new_revision_id = line_parts[1].to_i
551
+ class_short = line_parts[2]
552
+
553
+ ground_truth[:"#{old_revision_id}-#{new_revision_id}"] = {
554
+ old_revision_id: old_revision_id,
555
+ new_revision_id: new_revision_id,
556
+ class: class_short
557
+ }
558
+ end
559
+
560
+ ground_truth
561
+ end
562
+
563
+ # Cross validates classifier over full dataset with <fold>-fold cross
564
+ # validation
565
+ def cross_validate_all_instances(fold)
566
+ @classifier_instance.cross_validate(folds: fold)
567
+ rescue => error
568
+ raise "Error while cross validation: #{error}"
569
+ end
570
+
571
+ # Cross validates classifier over equally distributed dataset with
572
+ # <fold>-fold cross validation
573
+ def cross_validate_equally_distributed(fold)
574
+ dirname = @config.output_base_directory
575
+ FileUtils.mkdir(dirname) unless Dir.exist?(dirname)
576
+
577
+ file_name = 'cross_validation_eq_distr.txt'
578
+ file_path = File.join(dirname, file_name)
579
+
580
+ puts "Writing to #{file_path}…"
581
+ result_file = File.open(file_path, 'a')
582
+
583
+ begin
584
+ time = Time.now.strftime('%Y-%m-%d %H:%M')
585
+ type = @config.classifier_type
586
+ options = @config.classifier_options || 'default'
587
+ result_file.puts "\nCROSS VALIDATION - #{fold} fold (Classifier: #{type}, options: #{options} ) | #{time}"
588
+ result_file.puts "Features: \n\t#{@config.features.join("\n\t")}\n\n"
589
+
590
+ evaluations = []
591
+
592
+ times = 10
593
+
594
+ # run n times validation
595
+ (1..times).each do |i|
596
+ uniform_dataset = TrainingDataset.balanced_instances
597
+
598
+ print "\rcross validate dataset (equally distributed)… #{i}/#{times} | instances: #{uniform_dataset.size}"
599
+ @classifier_instance.train_with_instances(uniform_dataset)
600
+ evaluations << @classifier_instance.cross_validate(folds: fold)
601
+
602
+ if (i % (times / 10)).zero?
603
+ print_evaluation_data(evaluations, result_file, i)
604
+ end
605
+ end
606
+
607
+ #evaluation_data_of(evaluations)
608
+ evaluations
609
+ rescue => error
610
+ raise "Error while cross validation for equally distributed instances: #{error}"
611
+ ensure
612
+ result_file.close
613
+ puts "\nThe evaluation results has been saved to #{file_path}"
614
+ end
615
+ end
616
+
617
+ # Returns the evaluation data average value hash of the given evaluations.
618
+ def evaluation_data_of(evaluations)
619
+ class_index = Instances::VANDALISM_CLASS_INDEX
620
+ total_count = evaluations.count.to_f
621
+
622
+ recall = evaluations.reduce(0.0) { |result, sample| result + sample.recall(class_index) } / total_count
623
+ precision = evaluations.reduce(0.0) { |result, sample| result + sample.precision(class_index) } / total_count
624
+ area_under_prc = evaluations.reduce(0.0) { |result, sample| result + sample.area_under_prc(class_index) } / total_count
625
+
626
+ {
627
+ precision: precision,
628
+ recall: recall,
629
+ area_under_prc: area_under_prc
630
+ }
631
+ end
632
+
633
+ # Prints data to file
634
+ def print_evaluation_data(evaluations, file, index)
635
+ data = evaluation_data_of(evaluations)
636
+ file.puts "#{index}\tprecision: #{data[:precision]} | recall: #{data[:recall]} | Area under PRC: #{data[:area_under_prc]}"
637
+ end
638
+ end
639
+ end
640
+ end
@@ -0,0 +1,47 @@
1
module Wikipedia
  module VandalismDetection
    # @abstract Exceptions raised by Wikipedia::VandalismDetection inherit from
    # this Error
    class Error < StandardError; end

    # Exception is raised when trying to classify without a configured
    # classifier
    class ClassifierNotConfiguredError < Error; end

    # Exception is raised when trying to classify with an unknown classifier
    class ClassifierUnknownError < Error; end

    # Exception is raised when trying to use features without having configured
    # some
    class FeaturesNotConfiguredError < Error; end

    # Exception is raised when trying to use edits file without having
    # configured some
    class EditsFileNotConfiguredError < Error; end

    # Exception is raised when trying to use annotations file without having
    # configured some
    class AnnotationsFileNotConfiguredError < Error; end

    # Exception is raised when trying to read revisions directory without
    # having configured some
    class RevisionsDirectoryNotConfiguredError < Error; end

    # Exception is raised when trying to classify without a configured ground
    # truth test file
    class GroundTruthFileNotConfiguredError < Error; end

    # Exception is raised when there is no arff file available
    class ArffFileNotFoundError < Error; end

    # Exception is raised when there is no ground truth file available
    class GroundTruthFileNotFoundError < Error; end

    # Exception is raised when an already available feature should be added to
    # the arff file
    class FeatureAlreadyUsedError < Error; end

    # Exception is raised when a revisions text file cannot be found and loaded
    # NOTE(review): name breaks the *Error suffix convention used by every
    # other class here; renaming would break existing rescuers, so left as-is.
    class RevisionFileNotFound < Error; end
  end
end