wikipedia-vandalism_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.txt +4 -0
  5. data/README.md +265 -0
  6. data/Rakefile +12 -0
  7. data/lib/java/LibSVM.jar +0 -0
  8. data/lib/java/SMOTE.jar +0 -0
  9. data/lib/java/balancedRandomForest.jar +0 -0
  10. data/lib/java/diffutils-1.3.0.jar +0 -0
  11. data/lib/java/libsvm.jar +0 -0
  12. data/lib/java/oneClassClassifier.jar +0 -0
  13. data/lib/java/realAdaBoost.jar +0 -0
  14. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  15. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  16. data/lib/weka/classifiers/functions/lib_svm.rb +15 -0
  17. data/lib/weka/classifiers/meta/one_class_classifier.rb +25 -0
  18. data/lib/weka/classifiers/meta/real_ada_boost.rb +17 -0
  19. data/lib/weka/classifiers/trees/balanced_random_forest.rb +18 -0
  20. data/lib/weka/filters/supervised/instance/smote.rb +22 -0
  21. data/lib/wikipedia.rb +51 -0
  22. data/lib/wikipedia/vandalism_detection.rb +30 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +18 -0
  24. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +69 -0
  25. data/lib/wikipedia/vandalism_detection/classifier.rb +186 -0
  26. data/lib/wikipedia/vandalism_detection/configuration.rb +321 -0
  27. data/lib/wikipedia/vandalism_detection/diff.rb +27 -0
  28. data/lib/wikipedia/vandalism_detection/edit.rb +75 -0
  29. data/lib/wikipedia/vandalism_detection/evaluator.rb +606 -0
  30. data/lib/wikipedia/vandalism_detection/exceptions.rb +40 -0
  31. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +89 -0
  32. data/lib/wikipedia/vandalism_detection/features.rb +67 -0
  33. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +23 -0
  34. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +22 -0
  35. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +19 -0
  36. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +29 -0
  37. data/lib/wikipedia/vandalism_detection/features/article_size.rb +18 -0
  38. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +23 -0
  39. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/base.rb +54 -0
  41. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +23 -0
  42. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +22 -0
  43. data/lib/wikipedia/vandalism_detection/features/blanking.rb +25 -0
  44. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +25 -0
  45. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +22 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +22 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +17 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +27 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +22 -0
  51. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +22 -0
  52. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +22 -0
  53. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +27 -0
  54. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +18 -0
  55. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +18 -0
  56. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +24 -0
  57. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +65 -0
  58. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  59. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  60. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +20 -0
  61. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +22 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +22 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +18 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  65. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +20 -0
  66. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +18 -0
  67. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +20 -0
  68. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +27 -0
  69. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +27 -0
  70. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +24 -0
  71. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +18 -0
  72. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +23 -0
  73. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +23 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +23 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +22 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +27 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +28 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +23 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +23 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +20 -0
  83. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +23 -0
  84. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +18 -0
  85. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +22 -0
  86. data/lib/wikipedia/vandalism_detection/features/reverted.rb +18 -0
  87. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +29 -0
  89. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +23 -0
  90. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +21 -0
  91. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +22 -0
  92. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +26 -0
  93. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +25 -0
  94. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +31 -0
  95. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +22 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +24 -0
  97. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +31 -0
  98. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +24 -0
  99. data/lib/wikipedia/vandalism_detection/features/user_reputation.rb +38 -0
  100. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +23 -0
  101. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +22 -0
  102. data/lib/wikipedia/vandalism_detection/features/weekday.rb +21 -0
  103. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +22 -0
  104. data/lib/wikipedia/vandalism_detection/instances.rb +130 -0
  105. data/lib/wikipedia/vandalism_detection/page.rb +88 -0
  106. data/lib/wikipedia/vandalism_detection/page_parser.rb +52 -0
  107. data/lib/wikipedia/vandalism_detection/revision.rb +69 -0
  108. data/lib/wikipedia/vandalism_detection/revision_parser.rb +43 -0
  109. data/lib/wikipedia/vandalism_detection/test_dataset.rb +367 -0
  110. data/lib/wikipedia/vandalism_detection/text.rb +18 -0
  111. data/lib/wikipedia/vandalism_detection/training_dataset.rb +303 -0
  112. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  113. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists.rb +19 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +12 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +21 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +22 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +12 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +15 -0
  120. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +12 -0
  121. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +97 -0
  122. data/spec/factories/edit.rb +20 -0
  123. data/spec/factories/page.rb +13 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/config.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +22 -0
  152. data/spec/support/macros/file_reading.rb +7 -0
  153. data/spec/support/macros/test_configuration.rb +71 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +36 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +317 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +517 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +137 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +671 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +128 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +36 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +58 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +61 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +23 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +35 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +36 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +59 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +49 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +36 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +58 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +38 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +35 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +37 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +34 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +34 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +27 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +34 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +34 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +34 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +34 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +42 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +33 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +33 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +35 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +49 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +36 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +51 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +26 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +41 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +46 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +35 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +35 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +35 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +35 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +35 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +36 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +59 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +35 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +26 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +36 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +59 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +36 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +36 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +36 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +46 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +36 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +36 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +36 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +36 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +35 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +36 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +35 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +44 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +28 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +46 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +60 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +36 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +59 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +35 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +57 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +38 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +50 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +22 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +35 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +37 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +35 -0
  227. data/spec/vandalism_detection/features/user_reputation_spec.rb +52 -0
  228. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +36 -0
  229. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +58 -0
  230. data/spec/vandalism_detection/features/weekday_spec.rb +22 -0
  231. data/spec/vandalism_detection/features/words_increment_spec.rb +35 -0
  232. data/spec/vandalism_detection/instances_spec.rb +156 -0
  233. data/spec/vandalism_detection/page_parser_spec.rb +184 -0
  234. data/spec/vandalism_detection/page_spec.rb +135 -0
  235. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  236. data/spec/vandalism_detection/revision_spec.rb +115 -0
  237. data/spec/vandalism_detection/test_dataset_spec.rb +231 -0
  238. data/spec/vandalism_detection/text_spec.rb +29 -0
  239. data/spec/vandalism_detection/training_dataset_spec.rb +264 -0
  240. data/spec/vandalism_detection/wikitext_extractor_spec.rb +72 -0
  241. data/spec/weka/classifiers/functions/lib_svm_spec.rb +38 -0
  242. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +76 -0
  243. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +40 -0
  244. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +40 -0
  245. data/spec/weka/filters/supervised/instance/smote_spec.rb +6 -0
  246. data/wikipedia-vandalism_detection.gemspec +30 -0
  247. metadata +512 -0
@@ -0,0 +1,27 @@
1
module Wikipedia
  module VandalismDetection

    require 'java'
    require 'java/diffutils-1.3.0.jar'

    java_import 'difflib.DiffUtils'

    # Word-level diff between two revision texts, backed by the bundled
    # Java diffutils library (JRuby interop).
    class Diff

      # original, current [String] the two revision texts to compare.
      # Both inputs are re-encoded to UTF-8 with invalid/undefined bytes
      # stripped, so DiffUtils never receives malformed byte sequences.
      def initialize(original, current)
        @original = original.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
        @current = current.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

        # String#split with no arguments tokenizes on whitespace, so the
        # diff is computed over words, not lines or characters.
        @patch = DiffUtils.diff @original.split, @current.split
      end

      # Returns the words present in the current text but not in the
      # original one (flattened over all deltas of the patch).
      def inserted_words
        @patch.deltas.map {|delta| delta.revised.lines }.flatten
      end

      # Returns the words present in the original text but not in the
      # current one (flattened over all deltas of the patch).
      # NOTE(review): `lines` here is the Java Chunk accessor for its
      # token list (words, due to the split above), not Ruby String#lines.
      def removed_words
        @patch.deltas.map {|delta| delta.original.lines }.flatten
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require 'wikipedia/vandalism_detection/diff'
2
+ require 'wikipedia/vandalism_detection/text'
3
+ require 'wikipedia/vandalism_detection/page'
4
+
5
module Wikipedia
  module VandalismDetection
    # An Edit represents the transition between two sequent revisions of
    # a page and exposes the word-level insertions/removals between them.
    class Edit

      attr_reader :old_revision, :new_revision
      attr_accessor :page

      # Creates an Edit from two sequent revisions.
      #
      # Raises an ArgumentError unless new_revision's parent id equals
      # old_revision's id. attributes[:page] may supply the Page the
      # edit belongs to (defaults to a new, empty Page).
      def initialize(old_revision, new_revision, attributes = {})
        message = "old revision: #{old_revision.id} | parent: #{old_revision.parent_id},
        new revision: #{new_revision.id} | parent: #{new_revision.parent_id}"

        raise ArgumentError, "Revisions are not sequent: #{message}." unless sequent?(old_revision, new_revision)

        @old_revision = old_revision
        @new_revision = new_revision
        @page = attributes[:page] || Page.new
      end

      # Serializes the given revision attributes into a tab-separated
      # string: "<old revision values>\t<new revision values>", where each
      # side joins the values of the attributes the revision responds to
      # with commas. Unknown attributes are silently skipped.
      def serialize(*attributes)
        old_revision_string = revision_parts(@old_revision, attributes)
        new_revision_string = revision_parts(@new_revision, attributes)

        "#{old_revision_string}\t#{new_revision_string}"
      end

      # Returns an array of the words inserted in the new revision compared with the old one.
      def inserted_words
        @diff ||= Diff.new(@old_revision.text, @new_revision.text)
        @inserted_words ||= @diff.inserted_words
      end

      # Returns a Text of the words inserted in the new revision compared with the old one.
      def inserted_text
        @inserted_text ||= Text.new(inserted_words.join(' '))
      end

      # Returns an array of the words removed in the new revision compared with the old one.
      def removed_words
        @diff ||= Diff.new(@old_revision.text, @new_revision.text)
        @removed_words ||= @diff.removed_words
      end

      # Returns a Text of the words removed in the new revision compared with the old one.
      def removed_text
        @removed_text ||= Text.new(removed_words.join(' '))
      end

      protected

      # Returns whether the given revisions are sequent, i.e. the old revision's id is the new revision's parent id.
      def sequent?(old_revision, new_revision)
        new_revision.parent_id == old_revision.id
      end

      private

      # Joins the revision's values for all attributes it responds to,
      # comma separated. (DRYs up the two identical loops the previous
      # serialize implementation contained; public_send replaces the
      # roundabout `method(attr).call` — the respond_to? guard already
      # restricts the lookup to public methods.)
      def revision_parts(revision, attributes)
        attributes
          .select { |attr| revision.respond_to?(attr) }
          .map { |attr| revision.public_send(attr) }
          .join(',')
      end
    end
  end
end
@@ -0,0 +1,606 @@
1
+ require 'wikipedia/vandalism_detection/configuration'
2
+ require 'wikipedia/vandalism_detection/exceptions'
3
+ require 'wikipedia/vandalism_detection/training_dataset'
4
+ require 'wikipedia/vandalism_detection/test_dataset'
5
+ require 'wikipedia/vandalism_detection/classifier'
6
+ require 'wikipedia/vandalism_detection/instances'
7
+ require 'ruby-band'
8
+ require 'fileutils'
9
+ require 'csv'
10
+
11
+ module Wikipedia
12
+ module VandalismDetection
13
+
14
+ # This class provides methods for the evaluation of a Wikipedia::VandalismDetection::Classifier
15
+ # using the Weka framework.
16
+ #
17
+ # @example
18
+ # classifier = Wikipedia::VandalismDetection::Classifier.new
19
+ # evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
20
+ #
21
+ # evaluation = evaluator.cross_validate
22
+ # evaluation = evaluator.cross_validate(equally_distributed: true)
23
+ #
24
+ # puts evaluation[:precision]
25
+ # puts evaluation[:recall]
26
+ # puts evaluation[:area_under_prc]
27
+ class Evaluator
28
+
29
# Default number of threshold samples for the performance curves.
# (Integer literals are already immutable — the former `.freeze` was a no-op.)
DEFAULT_SAMPLE_COUNT = 200
30
+
31
# Creates a new Evaluator for the given classifier.
#
# classifier must be a Wikipedia::VandalismDetection::Classifier,
# otherwise an ArgumentError is raised. The global configuration and
# the classifier's underlying classifier instance are cached for the
# evaluation runs.
def initialize(classifier)
  raise(ArgumentError, 'Classifier param has to be a Wikipedia::VandalismDetection::Classifier instance') unless
      classifier.is_a?(Wikipedia::VandalismDetection::Classifier)

  @config = Wikipedia::VandalismDetection.configuration
  @classifier = classifier
  @classifier_instance = classifier.classifier_instance
end
39
+
40
# Cross validates the classifier.
# The fold count comes from the configuration and falls back to the
# default defined in DefaultConfiguration (default is 10).
# With equally_distributed: true the validation runs on an equally
# distributed instance set instead of all instances.
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluation = classifier.cross_validate
#   evaluation = classifier.cross_validate(equally_distributed: true)
#
def cross_validate(options = {})
  fold_default = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['classifier']['cross-validation-fold']
  fold = @config.cross_validation_fold || fold_default

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end
60
+
61
# Returns a Hash comprising the evaluation curve data Arrays for
# precision and recall plus the area under the precision/recall curve.
#
# @example
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#   curve_data = evaluator.curve_data
#
#   curve_data[:precision]       # => [0.76, ..., 0.91]
#   curve_data[:recall]          # => [0.87, ..., 0.89]
#   curve_data[:area_under_prc]  # => 0.83
def curve_data(options = {})
  evaluations = cross_validate(options)
  # cross_validate may return a single evaluation or an array of them
  evaluation_data = evaluations.is_a?(Array) ? evaluations.first : evaluations

  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new
  instances = threshold_curve.curve(evaluation_data.predictions, Instances::VANDALISM_CLASS_INDEX)

  {
    precision: instances.return_attr_data('Precision'),
    recall: instances.return_attr_data('Recall'),
    area_under_prc: evaluation_data.area_under_prc(Instances::VANDALISM_CLASS_INDEX)
  }
end
93
+
94
# Evaluates the classification of the configured test corpus against the
# given ground truth. Runs the classification file creation automatically
# unless the classification file exists, yet.
#
# The number of threshold samples can be set via the sample_count: option
# (defaults to DEFAULT_SAMPLE_COUNT, i.e. 200).
#
# Returns a Hash with values:
#   :recalls         - recall values
#   :precisions      - precision values
#   :fp_rates        - false positive rate values
#   :pr_auc          - area under precision/recall curve
#   :roc_auc         - area under receiver operator curve
#   :total_recall    - overall classifier recall value
#   :total_precision - overall classifier precision value
#
# Raises GroundTruthFileNotConfiguredError / GroundTruthFileNotFoundError
# when the ground truth file is unset or missing.
#
# @example
#   evaluator.evaluate_testcorpus_classification
#   evaluator.evaluate_testcorpus_classification(sample_count: 50)
#
def evaluate_testcorpus_classification(options = {})
  ground_truth_file_path = @config.test_corpus_ground_truth_file

  raise(GroundTruthFileNotConfiguredError, 'Ground truth file path has to be set for test set evaluation!') \
    unless ground_truth_file_path

  raise(GroundTruthFileNotFoundError, 'Configured ground truth file is not available.') \
    unless File.exist?(ground_truth_file_path)

  ground_truth = ground_truth_hash(ground_truth_file_path)
  classification_file = @config.test_output_classification_file
  create_testcorpus_classification_file!(classification_file, ground_truth)
  classification = classification_hash(classification_file)

  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, sample_count)
  best_pair = maximum_precision_recall(curves[:precisions], curves[:recalls])

  curves[:total_recall] = best_pair[:recall]
  curves[:total_precision] = best_pair[:precision]

  curves
end
141
+
142
# Returns the performance curve points (recall, precision, fp-rate) and
# the computed areas under the PR and ROC curves for the given ground
# truth and classification, sampled at sample_count thresholds.
def test_performance_curves(ground_truth, classification, sample_count)
  # drop the first threshold so the degenerate [0, 1] point is not used in the curve
  thresholds = (0.0...1.0).step(1.0 / sample_count.to_f).to_a.drop(1)

  precisions = []
  recalls = []
  fp_rates = []

  thresholds.each do |threshold|
    counts = predictive_values(ground_truth, classification, threshold)
    params = performance_parameters(counts[:tp], counts[:fp], counts[:tn], counts[:fn])

    precisions << params[:precision]
    recalls << params[:recall]
    fp_rates << params[:fp_rate]
  end

  # the recall values double as the ROC curve's true positive rates
  pr_sorted = sort_curve_values(recalls, precisions, { x: 0.0 }, { y: 0.0 })
  roc_sorted = sort_curve_values(fp_rates, recalls, { y: 0.0 }, { x: 1.0 })

  {
    precisions: pr_sorted[:y],
    recalls: pr_sorted[:x],
    fp_rates: roc_sorted[:x],
    tp_rates: roc_sorted[:y],
    pr_auc: area_under_curve(pr_sorted[:x], pr_sorted[:y]),
    roc_auc: area_under_curve(roc_sorted[:x], roc_sorted[:y])
  }
end
176
+
177
# Returns the predictive values hash (TP, FP, TN, FN) for a certain threshold.
#
#   tp - vandalism classified as vandalism
#   fp - regular classified as vandalism
#   tn - regular classified as regular
#   fn - vandalism classified as regular
def predictive_values(ground_truth, classification, threshold)
  tp = fp = tn = fn = 0

  ground_truth.each do |_sample_key, values|
    target_class = values[:class]
    key = :"#{values[:old_revision_id]}-#{values[:new_revision_id]}"

    # skip annotated samples that do not appear in the classification
    next unless classification.has_key?(key)

    confidence = classification[key][:confidence]

    tp += 1 if Evaluator.true_positive?(target_class, confidence, threshold)
    fn += 1 if Evaluator.false_negative?(target_class, confidence, threshold)
    fp += 1 if Evaluator.false_positive?(target_class, confidence, threshold)
    tn += 1 if Evaluator.true_negative?(target_class, confidence, threshold)
  end

  { tp: tp, fp: fp, tn: tn, fn: fn }
end
201
+
202
# Returns whether the given confidence value represents a true positive
# (TP) regarding the given target class and threshold: a vandalism
# sample whose confidence lies strictly above the threshold.
def self.true_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT
  confidence.to_f > threshold.to_f
end
207
+
208
# Returns whether the given confidence value represents a true negative
# (TN) regarding the given target class and threshold: a regular sample
# whose confidence lies strictly below the threshold.
def self.true_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT
  confidence.to_f < threshold.to_f
end
213
+
214
# Returns whether the given confidence value represents a false positive
# (FP) regarding the given target class and threshold: a regular sample
# whose confidence reaches or exceeds the threshold.
def self.false_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT
  confidence.to_f >= threshold.to_f
end
219
+
220
# Returns whether the given confidence value represents a false negative
# (FN) regarding the given target class and threshold: a vandalism
# sample whose confidence does not exceed the threshold.
def self.false_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT
  confidence.to_f <= threshold.to_f
end
225
+
226
# Returns a hash with performance parameters (precision, recall,
# fp_rate) computed from the given TP, FP, TN, FN counts.
# A degenerate denominator (zero predicted/actual positives or zero
# negatives) yields 1.0 for the corresponding value.
def performance_parameters(tp, fp, tn, fn)
  precision = (tp + fp).zero? ? 1.0 : tp.to_f / (tp + fp)
  recall    = (tp + fn).zero? ? 1.0 : tp.to_f / (tp + fn)
  fp_rate   = (fp + tn).zero? ? 1.0 : fp.to_f / (fp + tn)

  { precision: precision, recall: recall, fp_rate: fp_rate }
end
238
+
239
# Returns the calculated area under curve for the given point values
# using the trapezoidal rule. x_values and y_values have to be float
# arrays of the same length, otherwise an ArgumentError is raised.
def area_under_curve(x_values, y_values)
  raise ArgumentError, 'x and y values must have the same length!' unless x_values.count == y_values.count

  points = x_values.zip(y_values)

  # trapezoid area formula: A = 1/2 * (b1 + b2) * h, summed over
  # consecutive point pairs; fewer than two points contribute nothing
  area = points.each_cons(2).reduce(0.0) do |sum, ((x1, y1), (x2, y2))|
    sum + 0.5 * (y1 + y2) * (x2 - x1)
  end

  area.abs
end
260
+
261
# Returns the given value arrays sorted by the first array (x_values).
# Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> }.
# start_values is added in front of the arrays if set, e.g. { x: 0.0, y: 1.0 };
# end_values is added to the end of the arrays if set, e.g. { x: 1.0, y: 1.0 }.
# A missing :x or :y coordinate is filled in from the current first/last value.
#
# @example
#   evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
#   #=> Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  # pair each x with its y, sort by x ascending and (for equal x) y
  # descending, then drop exact duplicate points
  merge_sorted = (x_values.each_with_index.map { |x, index| [x, y_values[index]] })
  merge_sorted = merge_sorted.sort_by{ |values| [values[0], - values[1]] }.uniq

  x = merge_sorted.transpose[0]
  y = merge_sorted.transpose[1]

  # only extend the curve when the caller requested at least one coordinate
  start_values_set = start_values && (start_values.has_key?(:x) || start_values.has_key?(:y))
  end_values_set = end_values && (end_values.has_key?(:x) || end_values.has_key?(:y))

  if start_values_set
    # avoid duplicating a start point that is already present
    unless x.first == start_values[:x] && y.first == start_values[:y]
      x.unshift(start_values[:x] || x.first)
      y.unshift(start_values[:y] || y.first)
    end
  end

  if end_values_set
    # avoid duplicating an end point that is already present
    unless x.last == end_values[:x] && y.last == end_values[:y]
      x.push(end_values[:x] || x.last)
      y.push(end_values[:y] || y.last)
    end
  end

  { x: x, y: y }
end
295
+
296
# Returns the precision/recall pair whose product (rectangle area under the
# point) is maximal. Pairs whose area is NaN are ignored.
def maximum_precision_recall(precisions, recalls)
  areas = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # Drop pairs with NaN areas (caused by NaN precision or recall values);
  # NaN must be removed before comparing, since NaN is not orderable.
  areas = areas.reject { |area, _index| area.to_f.nan? }

  # areas.max already yields the largest [area, index] pair; the former
  # `areas.sort.max` sorted the whole array only to take its maximum.
  max_index = areas.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end
307
+
308
# Creates the test corpus text file by classifying the configured test samples.
# All sub steps (as creating the test arff file, etc.) are run automatically if needed.
#
# @param file_path [String] path of the classification file to write
# @param ground_truth_data [Hash] parsed ground truth data (must not be nil)
# @raise [ArgumentError] if ground_truth_data is nil
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  raise(ArgumentError, "Ground truth data hash is not allowed to be nil!") if ground_truth_data.nil?

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  # Dir.exist? replaces the deprecated Dir.exists?
  FileUtils.mkdir_p(dir_name) unless Dir.exist?(dir_name)

  # The last two attributes are revision ids, the last one the class.
  feature_names = dataset.enumerate_attributes.to_a.map { |attr| attr.name.upcase }[0...-2]
  header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

  # Block form guarantees the file handle is closed even if classification raises.
  File.open(file_path, 'w') do |file|
    file.puts header

    dataset.to_a2d.each do |instance|
      features = instance[0...-3]
      old_revision_id = instance[-3].to_i
      new_revision_id = instance[-2].to_i
      ground_truth_class_name = Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

      classification = @classifier.classify(features, return_all_params: true)
      class_value = Features::MISSING_VALUE

      if @config.classifier_type.match(/Functions::LibSVM/) && @config.classifier_options.match(/-s 2/i)
        # LibSVM with one class has only one class during training:
        # vandalism will get class index 0 while classifying, regular will get
        # missing (or Instances::NOT_KNOWN_INDEX in Wikipedia::VandalismDetection::Classifier).
        if classification[:class_index] == 0
          class_value = 1.0
        elsif classification[:class_index] == Instances::NOT_KNOWN_INDEX
          class_value = 0.0
        end
      else
        if classification[:class_index] == Instances::VANDALISM_CLASS_INDEX
          class_value = 1.0
        elsif classification[:class_index] == Instances::REGULAR_CLASS_INDEX
          class_value = 0.0
        end
      end

      confidence = classification[:confidence] || class_value

      # One-class classifiers trained on the vandalism class report inverted confidences.
      must_be_inverted = @config.use_occ? && !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
      confidence_value = must_be_inverted ? (1.0 - confidence) : confidence
      # Serialize NaN feature values as the missing-value marker.
      features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

      file.puts [old_revision_id, new_revision_id, ground_truth_class_name, confidence_value, *features].join(' ')
    end
  end
end
362
+
363
# Returns a hash comprising each feature's predictive values analysis for
# different thresholds. The hash structure is:
# {
#   feature_name_1: {
#     0.0 => {fp: , fn: , tp: , tn: },
#     ... => {fp: , fn: , tp: , tn: },
#     1.0 => {fp: , fn: , tp: , tn: }
#   },
#   ...,
#   feature_name_n: { ... }
# }
#
# @param options [Hash] :sample_count — number of thresholds in [0, 1]
#   (default DEFAULT_SAMPLE_COUNT)
def feature_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file
  training_dataset = TrainingDataset.instances
  test_dataset = TestDataset.build!

  # Hoisted out of the loop: the ground truth file is identical for every
  # feature, so it only needs to be read and parsed once.
  ground_truth = ground_truth_hash(ground_truth_file_path)

  analysis = {}

  @config.features.each_with_index do |feature_name, index|
    puts "analyzing feature... '#{feature_name}'"

    dataset = filter_single_attribute(training_dataset, index)
    print ' | train classifier with feature data...'
    classifier = Classifier.new(dataset)
    print "done \n"

    classification = classification_data(classifier, test_dataset)

    values = {}

    thresholds.each do |threshold|
      values[threshold] = predictive_values(ground_truth, classification, threshold)
    end

    analysis[feature_name] = values
  end

  analysis
end
412
+
413
# Returns a hash comprising the classifier's predictive values when using
# all configured features, evaluated at evenly spaced thresholds in [0, 1].
#
# @param options [Hash] :sample_count — number of thresholds
#   (default DEFAULT_SAMPLE_COUNT)
def full_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  step_width = 1.0 / (sample_count - 1)
  thresholds = (0.0..1.0).step(step_width).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file

  puts 'train classifier...'
  classifier = Classifier.new

  test_dataset = TestDataset.build!

  puts 'computing classification...'
  classification = classification_data(classifier, test_dataset)
  ground_truth = ground_truth_hash(ground_truth_file_path)

  analysis = thresholds.each_with_object({}) do |threshold, result|
    result[threshold] = predictive_values(ground_truth, classification, threshold)
  end

  print "done \n"
  analysis
end
439
+
440
+ private
441
+
442
# Returns a dataset only holding the attribute at the given index plus the
# class attribute (kept as the last column).
# Weka's Unsupervised Attribute Remove filter is used with -V (invert
# selection), so only the listed 1-based column indices are retained.
#
# @param dataset the Weka dataset to filter
# @param attribute_index [Integer] 0-based index of the attribute to keep
def filter_single_attribute(dataset, attribute_index)
  filter = Weka::Filters::Unsupervised::Attribute::Remove.new

  # NOTE(review): `set` appears to evaluate this block in the filter's own
  # context (DSL style) — `data` and `filter_options` look like filter
  # setters, not local methods; confirm against the weka wrapper gem.
  filter.set do
    data dataset
    # +1 converts 0-based indices to Weka's 1-based column numbering.
    filter_options "-V -R #{attribute_index + 1},#{dataset.class_index + 1}"
  end

  filtered = filter.use
  # Re-declare the last remaining column as the class attribute.
  filtered.class_index = filtered.n_col - 1
  filtered
end
456
+
457
# Returns a hash of the test corpus' classification results with the given
# classifier, keyed by :"<old_revision_id>-<new_revision_id>". Each value
# holds the revision ids, the classified class short name and the
# (possibly inverted) confidence.
def classification_data(classifier, test_dataset)
  classification = {}

  test_dataset.to_a2d.each do |instance|
    # The last three columns are old revision id, new revision id and class.
    features = instance[0...-3]

    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i

    params = classifier.classify(features, return_all_params: true)
    class_short_name = Instances::CLASSES_SHORT[params[:class_index]]

    # NOTE(review): this uses a single `!` while the analogous code in
    # create_testcorpus_classification_file! uses `!!`, i.e. the inversion
    # condition is negated here. Also, the options are read from the
    # @classifier instance variable rather than the `classifier` parameter
    # used for classification above. Verify which behavior is intended.
    must_be_inverted = @config.use_occ? && !(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
    confidence = must_be_inverted ? (1.0 - params[:confidence]) : params[:confidence]

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short_name,
      confidence: confidence
    }
  end

  classification
end
483
+
484
# Parses the given classification file into a hash keyed by
# :"<old_revision_id>-<new_revision_id>". The file's first line is a header
# and is skipped; every following line is expected to start with
# "<old id> <new id> <class short name> <confidence>".
def classification_hash(classification_file)
  samples = File.read(classification_file).lines.to_a
  samples.shift # drop the header line

  samples.each_with_object({}) do |line, classification|
    old_id, new_id, class_short, confidence = line.split(' ')

    old_revision_id = old_id.to_i
    new_revision_id = new_id.to_i

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short,
      confidence: confidence.to_f
    }
  end
end
510
+
511
# Parses the given ground truth file into a hash keyed by
# :"<old_revision_id>-<new_revision_id>". Unlike the classification file,
# ground truth files carry no header line; every line is expected to start
# with "<old id> <new id> <class short name>".
def ground_truth_hash(ground_truth_file)
  lines = File.read(ground_truth_file).lines.to_a

  lines.each_with_object({}) do |line, ground_truth|
    old_id, new_id, class_short = line.split(' ')

    old_revision_id = old_id.to_i
    new_revision_id = new_id.to_i

    ground_truth[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short
    }
  end
end
534
+
535
# Cross validates the classifier over the full dataset with <fold>-fold
# cross validation.
#
# @param fold [Integer] number of cross validation folds
# @raise [RuntimeError] if the underlying cross validation fails
def cross_validate_all_instances(fold)
  # Method-level rescue replaces the redundant begin/end wrapper around
  # the entire method body.
  @classifier_instance.cross_validate(fold)
rescue => e
  raise "Error while cross validation: #{e}"
end
543
+
544
# Cross validates the classifier over equally distributed datasets with
# <fold>-fold cross validation. The validation is run 10 times, each on
# freshly balanced training instances; intermediate results are appended to
# 'cross_validation_eq_distr.txt' in the output base directory.
#
# @param fold [Integer] number of cross validation folds
# @return [Array] the collected Weka evaluation objects
# @raise [RuntimeError] if the underlying cross validation fails
def cross_validate_equally_distributed(fold)
  dirname = @config.output_base_directory
  # Dir.exist? replaces the deprecated Dir.exists?
  FileUtils.mkdir(dirname) unless Dir.exist?(dirname)

  file_name = 'cross_validation_eq_distr.txt'
  file_path = File.join(dirname, file_name)

  puts "Writing to #{file_path}..."
  result_file = File.open(file_path, 'a')

  begin
    time = Time.now.strftime("%Y-%m-%d %H:%M")
    type = @config.classifier_type
    options = @config.classifier_options || "default"
    result_file.puts "\nCROSS VALIDATION - #{fold} fold (Classifier: #{type}, options: #{options} ) | #{time}"
    result_file.puts "Features: \n\t#{@config.features.join("\n\t")}\n\n"

    evaluations = []
    times = 10

    # Run the validation n times, each on a newly balanced dataset.
    (1..times).each do |i|
      uniform_dataset = TrainingDataset.balanced_instances

      print "\rcross validate dataset (equally distributed) ... #{i}/#{times} | instances: #{uniform_dataset.n_rows}"
      @classifier_instance.set_data(uniform_dataset)
      evaluations << @classifier_instance.cross_validate(fold)

      # With times = 10 this condition is true every iteration, i.e. the
      # running averages are logged after each run.
      print_evaluation_data(evaluations, result_file, i) if (i % (times / 10)) == 0
    end

    evaluations
  rescue => e
    raise "Error while cross validation for equally distributed instances: #{e}"
  ensure
    result_file.close
    puts "\nThe evaluation results has been saved to #{file_path}"
  end
end
586
+
587
# Returns the average recall, precision and area under PRC (for the
# vandalism class) over all given evaluations.
#
# @return [Hash] { precision:, recall:, area_under_prc: }
def evaluation_data_of(evaluations)
  class_index = Instances::VANDALISM_CLASS_INDEX
  total_count = evaluations.count.to_f

  recall, precision, area_under_prc =
    [:recall, :precision, :area_under_prc].map do |measure|
      evaluations.reduce(0.0) { |total, sample| total + sample.send(measure, class_index) } / total_count
    end

  { precision: precision, recall: recall, area_under_prc: area_under_prc }
end
598
+
599
# Appends one line with the averaged evaluation measures (precision, recall,
# area under PRC) for the given run index to the given file.
def print_evaluation_data(evaluations, file, index)
  data = evaluation_data_of(evaluations)
  line = "#{index}\tprecision: #{data[:precision]} | recall: #{data[:recall]} | Area under PRC: #{data[:area_under_prc]}"
  file.puts line
end
604
+ end
605
+ end
606
+ end