wikipedia-vandalism_detection 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,640 @@
1
+ require 'wikipedia/vandalism_detection/configuration'
2
+ require 'wikipedia/vandalism_detection/exceptions'
3
+ require 'wikipedia/vandalism_detection/training_dataset'
4
+ require 'wikipedia/vandalism_detection/test_dataset'
5
+ require 'wikipedia/vandalism_detection/classifier'
6
+ require 'wikipedia/vandalism_detection/instances'
7
+ require 'weka'
8
+ require 'fileutils'
9
+ require 'csv'
10
+
11
+ module Wikipedia
12
+ module VandalismDetection
13
+ # This class provides methods for the evaluation of a
14
+ # Wikipedia::VandalismDetection::Classifier using the weka framework.
15
+ #
16
+ # @example
17
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
18
+ # evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
19
+ #
20
+ # evaluation = evaluator.cross_validate
21
+ # evaluation = evaluator.cross_validate(equally_distributed: true)
22
+ #
23
+ # puts evaluation[:precision]
24
+ # puts evaluation[:recall]
25
+ # puts evaluation[:area_under_prc]
26
+ class Evaluator
27
+ DEFAULT_SAMPLE_COUNT = 200
28
+ DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS
29
+
30
# Builds an evaluator bound to the given classifier.
#
# @param classifier [Wikipedia::VandalismDetection::Classifier] the
#   classifier to evaluate
# @raise [ArgumentError] if the argument is not a Classifier instance
def initialize(classifier)
  unless classifier.is_a?(Wikipedia::VandalismDetection::Classifier)
    raise ArgumentError, 'The classifier argument has to be an instance of ' \
                         'Wikipedia::VandalismDetection::Classifier'
  end

  @config = Wikipedia::VandalismDetection.config
  @classifier = classifier
  @classifier_instance = classifier.classifier_instance
end
41
+
42
+ # Cross validates the classifier.
43
+ # Fold is used as defined in configuration (default is 10).
44
+ #
45
+ # @example
46
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
47
+ # evaluation = classifier.cross_validate
48
+ # evaluation = classifier.cross_validate(equally_distributed: true)
49
+ #
50
# Cross validates the classifier using the configured fold
# (falls back to the DEFAULTS fold when none is configured).
#
# @param options [Hash] pass `equally_distributed: true` to validate on
#   an equally distributed sample set
def cross_validate(options = {})
  fold = @config.cross_validation_fold ||
         DEFAULTS['classifier']['cross-validation-fold']

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end
62
+
63
+ # Returns a Hash comprising the evaluation curve data Arrays for precision, recall
64
+ #
65
+ # @example
66
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
67
+ # evaluator = classifier.evaluator
68
+ # or
69
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
70
+ # evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
71
+ #
72
+ # curve_data = evaluator.curve_data
73
+ #
74
+ # curve_data[:precision]
75
+ # # => [0.76, ..., 0.91]
76
+ #
77
+ # curve_data[:recall]
78
+ # # => [0.87, ..., 0.89]
79
+ #
80
+ # curve_data[:area_under_prc]
81
+ # # => 0.83
82
# Returns precision/recall curve data from a cross validation run.
#
# @param options [Hash] forwarded to #cross_validate
# @return [Hash] { precision: [Float], recall: [Float], area_under_prc: Float }
def curve_data(options = {})
  evaluations = cross_validate(options)
  # cross validation may return a single evaluation or an array of them;
  # use the first in the latter case
  evaluation_data = evaluations.is_a?(Array) ? evaluations.first : evaluations

  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new
  instances = threshold_curve.curve(
    evaluation_data.predictions,
    Instances::VANDALISM_CLASS_INDEX
  )

  {
    precision: instances.attribute_to_double_array(0).to_a,
    recall: instances.attribute_to_double_array(1).to_a,
    area_under_prc: evaluation_data.area_under_prc(Instances::VANDALISM_CLASS_INDEX)
  }
end
103
+
104
+ # Evaluates the classification of the configured test corpus against the
105
+ # given ground truth.
106
+ # Runs the file creation automatically unless the classification file
107
+ # exists, yet.
108
+ #
109
+ # Number of samples to use can be set by 'sample_count: <number>'
110
+ # option. Default number of samples is 100.
111
+ #
112
+ # Returns a Hash with values:
113
+ # :recalls - recall values
114
+ # :precisions - precision values
115
+ # :fp_rates - false positive rate values
116
+ # :pr_auc - area under precision recall curve
117
+ # :roc_auc - area under receiver operating characteristic curve
118
+ # :total_recall - overall classifier recall value
119
+ # :total_precision - overall classifier precision value
120
+ #
121
+ # @example
122
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
123
+ # evaluator = classifier.evaluator
124
+ # or
125
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
126
+ # evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
127
+ #
128
+ # evaluator.evaluate_testcorpus_classification
129
+ # evaluator.evaluate_testcorpus_classification(sample_count: 50)
130
+ #
131
# Evaluates the classification of the configured test corpus against the
# configured ground truth file. Creates the classification file first if
# it does not exist yet.
#
# @param options [Hash] `sample_count:` number of threshold samples
#   (defaults to DEFAULT_SAMPLE_COUNT)
# @return [Hash] performance curves plus :total_recall/:total_precision
# @raise [GroundTruthFileNotConfiguredError] when no ground truth path is set
# @raise [GroundTruthFileNotFoundError] when the configured file is missing
def evaluate_testcorpus_classification(options = {})
  ground_truth_path = @config.test_corpus_ground_truth_file

  unless ground_truth_path
    raise GroundTruthFileNotConfiguredError,
          'Ground truth file path has to be set for test set evaluation'
  end

  unless File.exist?(ground_truth_path)
    raise GroundTruthFileNotFoundError,
          'Configured ground truth file is not available.'
  end

  classification_path = @config.test_output_classification_file

  ground_truth = ground_truth_hash(ground_truth_path)
  create_testcorpus_classification_file!(classification_path, ground_truth)
  classification = classification_hash(classification_path)

  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, sample_count)
  best_pair = maximum_precision_recall(curves[:precisions], curves[:recalls])

  curves[:total_recall] = best_pair[:recall]
  curves[:total_precision] = best_pair[:precision]

  curves
end
157
+
158
+ # Returns the performance curve points (recall, precision, fp-rate) and
159
+ # computed area under curves.
160
# Computes performance curve points (precision/recall and ROC) over
# `sample_count` evenly spaced thresholds, plus the areas under both curves.
#
# @return [Hash] :precisions, :recalls, :fp_rates, :tp_rates, :pr_auc, :roc_auc
def test_performance_curves(ground_truth, classification, sample_count)
  # evenly spaced thresholds in (0, 1); the first one (0.0) is dropped so
  # the degenerate [0, 1] point does not enter the curve
  thresholds = (0.0...1.0).step(1.0 / sample_count.to_f).to_a.drop(1)

  precisions = []
  recalls = []
  fp_rates = []

  thresholds.each do |threshold|
    counts = predictive_values(ground_truth, classification, threshold)
    params = performance_parameters(
      counts[:tp], counts[:fp], counts[:tn], counts[:fn]
    )

    precisions << params[:precision]
    recalls << params[:recall]
    fp_rates << params[:fp_rate]
  end

  # recall doubles as the true-positive rate for the ROC curve
  pr_sorted = sort_curve_values(recalls, precisions, x: 0.0, y: 0.0)
  roc_sorted = sort_curve_values(fp_rates, recalls, y: 0.0, x: 1.0)

  {
    precisions: pr_sorted[:y], recalls: pr_sorted[:x],
    fp_rates: roc_sorted[:x], tp_rates: roc_sorted[:y],
    pr_auc: area_under_curve(pr_sorted[:x], pr_sorted[:y]),
    roc_auc: area_under_curve(roc_sorted[:x], roc_sorted[:y])
  }
end
202
+
203
+ # Returns the predictive values hash (TP,FP, TN, FN) for a certain
204
+ # threshold.
205
# Counts the predictive values (TP, FP, TN, FN) for a given threshold.
# Ground truth samples without a matching classification entry are skipped.
#
# @param ground_truth [Hash] annotated samples keyed by sample id
# @param classification [Hash] classification results keyed by
#   :"<old_revision_id>-<new_revision_id>"
# @param threshold [Float] confidence cut-off
# @return [Hash] { tp:, fp:, tn:, fn: }
def predictive_values(ground_truth, classification, threshold)
  counts = { tp: 0, fp: 0, tn: 0, fn: 0 }

  ground_truth.each_value do |values|
    target_class = values[:class]
    key = :"#{values[:old_revision_id]}-#{values[:new_revision_id]}"

    # skip annotated samples that have no classification entry
    next unless classification.key?(key)

    confidence = classification[key][:confidence]

    counts[:tp] += 1 if Evaluator.true_positive?(target_class, confidence, threshold)
    counts[:fn] += 1 if Evaluator.false_negative?(target_class, confidence, threshold)
    counts[:fp] += 1 if Evaluator.false_positive?(target_class, confidence, threshold)
    counts[:tn] += 1 if Evaluator.true_negative?(target_class, confidence, threshold)
  end

  counts
end
229
+
230
+ # Returns whether the given confidence value represents a
231
+ # true positive (TP) regarding the given target class and threshold.
232
# A true positive (TP): a vandalism sample whose confidence is strictly
# above the threshold.
def self.true_positive?(target_class, confidence, threshold)
  confidence.to_f > threshold.to_f &&
    target_class == Instances::VANDALISM_SHORT
end
235
+
236
+ # Returns whether the given confidence value represents a
237
+ # true negative (TN) regarding the given target class and threshold.
238
# A true negative (TN): a regular sample whose confidence is strictly
# below the threshold.
def self.true_negative?(target_class, confidence, threshold)
  confidence.to_f < threshold.to_f &&
    target_class == Instances::REGULAR_SHORT
end
241
+
242
+ # Returns whether the given confidence value represents a
243
+ # false positive (FP) regarding the given target class and threshold.
244
# A false positive (FP): a regular sample whose confidence reaches or
# exceeds the threshold.
def self.false_positive?(target_class, confidence, threshold)
  confidence.to_f >= threshold.to_f &&
    target_class == Instances::REGULAR_SHORT
end
247
+
248
+ # Returns whether the given confidence value represents a
249
+ # false negative (FN) regarding the given target class and threshold.
250
# A false negative (FN): a vandalism sample whose confidence is at or
# below the threshold.
def self.false_negative?(target_class, confidence, threshold)
  confidence.to_f <= threshold.to_f &&
    target_class == Instances::VANDALISM_SHORT
end
253
+
254
+ # Returns a hash with performance parameters computed from given
255
+ # TP, FP, TN, FN
256
# Derives precision, recall and false-positive rate from the raw
# TP/FP/TN/FN counts. A ratio with a zero denominator is reported as 1.0.
#
# @return [Hash] { precision:, recall:, fp_rate: } with Float values
def performance_parameters(tp, fp, tn, fn)
  predicted_positives = tp + fp
  actual_positives = tp + fn
  actual_negatives = fp + tn

  {
    precision: predicted_positives.zero? ? 1.0 : tp.fdiv(predicted_positives),
    recall: actual_positives.zero? ? 1.0 : tp.fdiv(actual_positives),
    fp_rate: actual_negatives.zero? ? 1.0 : fp.fdiv(actual_negatives)
  }
end
267
+
268
+ # Returns the calculated area under curve for given point values
269
+ # x and y values has to be float arrays of the same length.
270
# Computes the (absolute) area under the curve given by the point arrays
# using the trapezoid rule: A = 1/2 * (b1 + b2) * h.
#
# @param x_values [Array<Float>]
# @param y_values [Array<Float>] must have the same length as x_values
# @return [Float] absolute area; 0.0 for fewer than two points
# @raise [ArgumentError] on length mismatch
def area_under_curve(x_values, y_values)
  if x_values.count != y_values.count
    raise ArgumentError, 'x and y values must have the same length!'
  end

  points = x_values.zip(y_values)

  area = points.each_cons(2).reduce(0.0) do |sum, ((x1, y1), (x2, y2))|
    sum + 0.5 * (y1 + y2) * (x2 - x1)
  end

  area.abs
end
291
+
292
+ # Returns given value array sorted by first array (x_values)
293
+ # Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> }
294
+ # start_value is added in front of arrays if set, e.g. {x: 0.0, y: 1.0}
295
+ # end_values is added to end of arrays if set, e.g. {x: 1.0, y: 1.0 }
296
+ #
297
+ # @example
298
+ # evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
299
+ # #=>Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
300
# Sorts the (x, y) pairs by x ascending (ties broken by y descending),
# removes duplicate pairs, and optionally prepends/appends fixed curve
# endpoints.
#
# @param start_values [Hash, nil] e.g. { x: 0.0, y: 0.0 } — added in front
#   unless the curve already starts there
# @param end_values [Hash, nil] e.g. { x: 1.0, y: 1.0 } — added at the end
#   unless the curve already ends there
# @return [Hash] { x: <sorted x values>, y: <y values in x order> }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  pairs = x_values.zip(y_values).sort_by { |x, y| [x, -y] }.uniq

  x = pairs.transpose[0]
  y = pairs.transpose[1]

  prepend_start = start_values &&
                  (start_values.key?(:x) || start_values.key?(:y)) &&
                  !(x.first == start_values[:x] && y.first == start_values[:y])

  if prepend_start
    x.unshift(start_values[:x] || x.first)
    y.unshift(start_values[:y] || y.first)
  end

  append_end = end_values &&
               (end_values.key?(:x) || end_values.key?(:y)) &&
               !(x.last == end_values[:x] && y.last == end_values[:y])

  if append_end
    x.push(end_values[:x] || x.last)
    y.push(end_values[:y] || y.last)
  end

  { x: x, y: y }
end
326
+
327
+ # Returns the maximum precision recall pair
328
# Returns the precision/recall pair whose product (rectangle area) is
# maximal. Pairs involving NaN values are ignored.
#
# @param precisions [Array<Float>]
# @param recalls [Array<Float>] parallel to precisions
# @return [Hash] { precision:, recall: } at the maximizing index
def maximum_precision_recall(precisions, recalls)
  areas = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # drop pairs containing NaN values (undefined precision or recall)
  areas.reject! { |pair| pair.any? { |value| value.to_f.nan? } }

  # `max` alone suffices; the previous `sort.max` did a redundant
  # O(n log n) sort only to take the maximum
  max_index = areas.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end
339
+
340
+ # Creates the test corpus text file by classifying the configured test
341
+ # samples. All sub steps (as creating the test arff file, etc.) are run
342
+ # automatically if needed.
343
# Creates the test corpus classification text file by classifying the
# configured test samples (builds the test dataset first if necessary).
#
# Each output line holds: OLDREVID NEWREVID C CONF <feature values...>.
#
# Fix: the file is now opened with a block so the handle is closed even
# when classification raises mid-loop (the previous explicit
# open/close leaked the handle on errors).
#
# @param file_path [String] output path; parent directories are created
# @param ground_truth_data [Hash] ground truth hash (only nil-checked here)
# @raise [ArgumentError] if ground_truth_data is nil
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  if ground_truth_data.nil?
    raise ArgumentError, 'Ground truth data hash is not allowed to be nil'
  end

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  FileUtils.mkdir_p(dir_name) unless Dir.exist?(dir_name)

  File.open(file_path, 'w') do |file|
    # last two attributes are the revision ids, last one the class
    feature_names = dataset.attribute_names.map(&:upcase)[0...-2]
    file.puts ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

    dataset.to_m.to_a.each do |instance|
      features = instance[0...-3]
      old_revision_id = instance[-3].to_i
      new_revision_id = instance[-2].to_i
      ground_truth_class_name =
        Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

      classification = @classifier.classify(features, return_all_params: true)

      class_value =
        if classification[:class_index] == Instances::VANDALISM_CLASS_INDEX
          1.0
        elsif classification[:class_index] == Instances::REGULAR_CLASS_INDEX
          0.0
        else
          Features::MISSING_VALUE
        end

      confidence = classification[:confidence] || class_value

      # one-class classifiers trained on the vandalism class report the
      # confidence for the opposite class, so invert it
      must_be_inverted = @config.use_occ? &&
                         !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
      confidence_value = must_be_inverted ? 1.0 - confidence : confidence

      # replace NaN feature values with the missing-value marker
      features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

      file.puts [
        old_revision_id,
        new_revision_id,
        ground_truth_class_name,
        confidence_value,
        *features
      ].join(' ')
    end
  end
end
392
+
393
# Returns a hash comprising each feature's predictive values analysis for
# different thresholds.
# The Hash structure is the following one:
# {
#   feature_name_1:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
#   ...,
#   feature_name_n:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
# }
#
# @param options [Hash] supports :sample_count (number of thresholds,
#   defaults to DEFAULT_SAMPLE_COUNT)
def feature_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file
  training_dataset = TrainingDataset.instances
  test_dataset = TestDataset.build!

  # The ground truth does not depend on the feature under analysis, so
  # read and parse the file once instead of once per feature.
  ground_truth = ground_truth_hash(ground_truth_file_path)

  analysis = {}

  @config.features.each_with_index do |feature_name, index|
    puts "analyzing feature… '#{feature_name}'"

    # Train a classifier on the single feature at `index`.
    dataset = filter_single_attribute(training_dataset, index)
    print ' | train classifier with feature data…'
    classifier = Classifier.new(dataset)
    print "done \n"

    classification = classification_data(classifier, test_dataset)

    values = {}

    thresholds.each do |threshold|
      values[threshold] = predictive_values(ground_truth, classification, threshold)
    end

    analysis[feature_name] = values
  end

  analysis
end
443
+
444
# Returns a hash comprising the classifiers predictive values for using
# all configured features for different thresholds.
#
# @param options [Hash] supports :sample_count (number of thresholds,
#   defaults to DEFAULT_SAMPLE_COUNT)
def full_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  step_width = 1.0 / (sample_count - 1)
  thresholds = (0.0..1.0).step(step_width).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file

  puts 'train classifier…'
  classifier = Classifier.new

  test_dataset = TestDataset.build!

  puts 'computing classification…'
  classification = classification_data(classifier, test_dataset)
  ground_truth = ground_truth_hash(ground_truth_file_path)

  # Compute the confusion-matrix counts once per threshold.
  analysis = thresholds.each_with_object({}) do |threshold, result|
    result[threshold] = predictive_values(ground_truth, classification, threshold)
  end

  print "done\n"
  analysis
end
470
+
471
+ private
472
+
473
# Returns a dataset only holding the attribute at the given index.
# Weka Unsupervised Attribute Remove filter is used.
#
# @param dataset [Object] Weka instances to filter
# @param attribute_index [Integer] zero-based index of the attribute to keep
# @return [Object] dataset holding only that attribute plus the class column
def filter_single_attribute(dataset, attribute_index)
  remove_filter = Weka::Filters::Unsupervised::Attribute::Remove.new

  # -V inverts the selection, -R takes 1-based column indices: keep only
  # the requested attribute and the class attribute.
  kept_columns = "#{attribute_index + 1},#{dataset.class_index + 1}"
  remove_filter.use_options("-V -R #{kept_columns}")

  result = remove_filter.filter(dataset)
  result.class_index = result.attributes_count - 1
  result
end
483
+
484
# Returns an array of classification confidences of the test corpus'
# classification with the given classifier
#
# @param classifier [Classifier] the (possibly per-feature) classifier
# @param test_dataset [Object] Weka test instances
# @return [Hash] maps :"<old>-<new>" revision keys to classification data
def classification_data(classifier, test_dataset)
  classification = {}

  test_dataset.to_m.to_a.each do |instance|
    features = instance[0...-3]

    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i

    params = classifier.classify(features, return_all_params: true)
    class_short_name = Instances::CLASSES_SHORT[params[:class_index]]

    # Ask the classifier passed as parameter for its options (it may be a
    # per-feature classifier built by #feature_analysis); the former code
    # queried the shared @classifier instead.
    # NOTE(review): the polarity (`!~`) is the opposite of the `=~` used in
    # #create_testcorpus_classification_file! — confirm which one is right.
    must_be_inverted = @config.use_occ? && classifier.classifier_instance.options !~ /#{Instances::VANDALISM}/
    confidence = must_be_inverted ? 1.0 - params[:confidence] : params[:confidence]

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short_name,
      confidence: confidence
    }
  end

  classification
end
511
+
512
# Returns a hash for classification data from given classification file
#
# @param classification_file [String] path to a whitespace separated
#   classification file whose first line is a header
# @return [Hash] maps :"<old>-<new>" revision keys to sample data
def classification_hash(classification_file)
  samples = File.read(classification_file).lines.to_a
  samples.shift # remove header line

  samples.each_with_object({}) do |line, classification|
    old_revision, new_revision, class_short, confidence = line.split(' ')

    old_revision_id = old_revision.to_i
    new_revision_id = new_revision.to_i

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short,
      confidence: confidence.to_f
    }
  end
end
538
+
539
# Returns a hash for classification data from given ground truth file
#
# @param ground_truth_file [String] path to a whitespace separated ground
#   truth file (no header line)
# @return [Hash] maps :"<old>-<new>" revision keys to sample data
def ground_truth_hash(ground_truth_file)
  lines = File.read(ground_truth_file).lines.to_a

  lines.each_with_object({}) do |line, ground_truth|
    old_revision, new_revision, class_short = line.split(' ')

    old_revision_id = old_revision.to_i
    new_revision_id = new_revision.to_i

    ground_truth[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short
    }
  end
end
562
+
563
# Cross validates classifier over full dataset with <fold>-fold cross
# validation
#
# @param fold [Integer] number of cross-validation folds
def cross_validate_all_instances(fold)
  begin
    @classifier_instance.cross_validate(folds: fold)
  rescue => e
    # Wrap any validation failure into a RuntimeError with context.
    raise "Error while cross validation: #{e}"
  end
end
570
+
571
# Cross validates classifier over equally distributed dataset with
# <fold>-fold cross validation
#
# @param fold [Integer] number of cross-validation folds
# @return [Array] the collected evaluation objects of all runs
def cross_validate_equally_distributed(fold)
  dirname = @config.output_base_directory
  # mkdir_p also creates missing parent directories (plain mkdir raises
  # Errno::ENOENT then) and matches the directory handling used for the
  # classification file.
  FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)

  file_name = 'cross_validation_eq_distr.txt'
  file_path = File.join(dirname, file_name)

  puts "Writing to #{file_path}…"
  result_file = File.open(file_path, 'a')

  begin
    time = Time.now.strftime('%Y-%m-%d %H:%M')
    type = @config.classifier_type
    options = @config.classifier_options || 'default'
    result_file.puts "\nCROSS VALIDATION - #{fold} fold (Classifier: #{type}, options: #{options} ) | #{time}"
    result_file.puts "Features: \n\t#{@config.features.join("\n\t")}\n\n"

    evaluations = []

    times = 10

    # run n times validation
    (1..times).each do |i|
      # Resample a balanced (equally distributed) dataset for each run.
      uniform_dataset = TrainingDataset.balanced_instances

      print "\rcross validate dataset (equally distributed)… #{i}/#{times} | instances: #{uniform_dataset.size}"
      @classifier_instance.train_with_instances(uniform_dataset)
      evaluations << @classifier_instance.cross_validate(folds: fold)

      # Write intermediate average results every 10% of the runs.
      if (i % (times / 10)).zero?
        print_evaluation_data(evaluations, result_file, i)
      end
    end

    evaluations
  rescue => error
    raise "Error while cross validation for equally distributed instances: #{error}"
  ensure
    result_file.close
    puts "\nThe evaluation results has been saved to #{file_path}"
  end
end
616
+
617
# Returns the evaluation data average value hash of the given evaluations.
#
# @param evaluations [Array] evaluation objects responding to #precision,
#   #recall and #area_under_prc (each taking a class index)
# @return [Hash] averaged :precision, :recall and :area_under_prc values
def evaluation_data_of(evaluations)
  class_index = Instances::VANDALISM_CLASS_INDEX
  total = evaluations.count.to_f

  # Averages the given metric over all evaluation samples.
  average = lambda do |metric|
    evaluations.reduce(0.0) { |sum, sample| sum + sample.public_send(metric, class_index) } / total
  end

  {
    precision: average.call(:precision),
    recall: average.call(:recall),
    area_under_prc: average.call(:area_under_prc)
  }
end
632
+
633
# Prints data to file
#
# @param evaluations [Array] evaluation objects to average
# @param file [IO] open file handle the summary line is written to
# @param index [Integer] run index prefixed to the summary line
def print_evaluation_data(evaluations, file, index)
  data = evaluation_data_of(evaluations)
  summary = "#{index}\tprecision: #{data[:precision]} | recall: #{data[:recall]} | Area under PRC: #{data[:area_under_prc]}"
  file.puts summary
end
638
+ end
639
+ end
640
+ end
@@ -0,0 +1,47 @@
1
module Wikipedia
  module VandalismDetection
    # @abstract Exceptions raised by Wikipedia::VandalismDetection inherit from
    #   this Error
    class Error < StandardError; end

    # Exception is raised when trying to classify without a configured
    # classifier
    class ClassifierNotConfiguredError < Error; end

    # Exception is raised when trying to classify with an unknown classifier
    class ClassifierUnknownError < Error; end

    # Exception is raised when trying to use features without having configured
    # some
    class FeaturesNotConfiguredError < Error; end

    # Exception is raised when trying to use edits file without having
    # configured some
    class EditsFileNotConfiguredError < Error; end

    # Exception is raised when trying to use annotations file without having
    # configured some
    class AnnotationsFileNotConfiguredError < Error; end

    # Exception is raised when trying to read revisions directory without
    # having configured some
    class RevisionsDirectoryNotConfiguredError < Error; end

    # Exception is raised when trying to classify without a configured ground
    # truth test file
    class GroundTruthFileNotConfiguredError < Error; end

    # Exception is raised when there is no arff file available
    class ArffFileNotFoundError < Error; end

    # Exception is raised when there is no ground truth file available
    class GroundTruthFileNotFoundError < Error; end

    # Exception is raised when an already available feature should be added to
    # the arff file
    class FeatureAlreadyUsedError < Error; end

    # Exception is raised when a revisions text file cannot be found and loaded
    class RevisionFileNotFound < Error; end
  end
end