wikipedia-vandalism_detection 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
@@ -0,0 +1,640 @@
|
|
1
|
+
require 'wikipedia/vandalism_detection/configuration'
|
2
|
+
require 'wikipedia/vandalism_detection/exceptions'
|
3
|
+
require 'wikipedia/vandalism_detection/training_dataset'
|
4
|
+
require 'wikipedia/vandalism_detection/test_dataset'
|
5
|
+
require 'wikipedia/vandalism_detection/classifier'
|
6
|
+
require 'wikipedia/vandalism_detection/instances'
|
7
|
+
require 'weka'
|
8
|
+
require 'fileutils'
|
9
|
+
require 'csv'
|
10
|
+
|
11
|
+
module Wikipedia
|
12
|
+
module VandalismDetection
|
13
|
+
# This class provides methods for the evaluation of a
|
14
|
+
# Wikipedia::VandalismDetection::Classifier using the weka framework.
|
15
|
+
#
|
16
|
+
# @example
|
17
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
18
|
+
# evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
|
19
|
+
#
|
20
|
+
# evaluation = evaluator.cross_validate
|
21
|
+
# evaluation = evaluator.cross_validate(equally_distributed: true)
|
22
|
+
#
|
23
|
+
# puts evaluation[:precision]
|
24
|
+
# puts evaluation[:recall]
|
25
|
+
# puts evaluation[:area_under_prc]
|
26
|
+
class Evaluator
|
27
|
+
DEFAULT_SAMPLE_COUNT = 200
|
28
|
+
DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS
|
29
|
+
|
30
|
+
# Builds an Evaluator around the given classifier.
#
# Raises an ArgumentError unless the argument is a
# Wikipedia::VandalismDetection::Classifier instance.
def initialize(classifier)
  unless classifier.is_a?(Wikipedia::VandalismDetection::Classifier)
    raise ArgumentError, 'The classifier argument has to be an instance of ' \
                         'Wikipedia::VandalismDetection::Classifier'
  end

  @config              = Wikipedia::VandalismDetection.config
  @classifier          = classifier
  @classifier_instance = classifier.classifier_instance
end
|
41
|
+
|
42
|
+
# Cross validates the classifier.
# The fold is taken from the configuration (default is 10).
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluation = classifier.cross_validate
#   evaluation = classifier.cross_validate(equally_distributed: true)
#
def cross_validate(options = {})
  # Fall back to the configured default fold count when none is set.
  fold = @config.cross_validation_fold ||
         DEFAULTS['classifier']['cross-validation-fold']

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end
|
62
|
+
|
63
|
+
# Returns a Hash comprising the evaluation curve data Arrays for
# precision and recall, plus the area under the precision-recall curve.
# Accepts the same options as #cross_validate (e.g. equally_distributed).
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = classifier.evaluator
#   or
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#
#   curve_data = evaluator.curve_data
#
#   curve_data[:precision]
#   # => [0.76, ..., 0.91]
#
#   curve_data[:recall]
#   # => [0.87, ..., 0.89]
#
#   curve_data[:area_under_prc]
#   # => 0.83
def curve_data(options = {})
  evaluations = cross_validate(options)
  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new

  # cross_validate may return an Array of evaluations (equally distributed
  # runs) — only the first one is used for the curve here.
  evaluation_data = evaluations.is_a?(Array) ? evaluations[0] : evaluations

  instances = threshold_curve.curve(
    evaluation_data.predictions,
    Instances::VANDALISM_CLASS_INDEX
  )

  # Attribute 0 holds the precision values, attribute 1 the recall values
  # of the Weka threshold-curve dataset.
  precision = instances.attribute_to_double_array(0).to_a
  recall = instances.attribute_to_double_array(1).to_a
  area_under_prc = evaluation_data.area_under_prc(Instances::VANDALISM_CLASS_INDEX)

  {
    precision: precision,
    recall: recall,
    area_under_prc: area_under_prc
  }
end
|
103
|
+
|
104
|
+
# Evaluates the classification of the configured test corpus against the
# given ground truth.
# Runs the file creation automatically unless the classification file
# exists, yet.
#
# Number of samples to use can be set by 'sample_count: <number>'
# option. Default number of samples is DEFAULT_SAMPLE_COUNT (200).
#
# Returns a Hash with values:
#   :recalls - recall values
#   :precisions - precision values
#   :fp_rates - false positive rate values
#   :tp_rates - true positive rate values
#   :pr_auc - area under precision recall curve
#   :roc_auc - area under receiver operator curve
#   :total_recall - overall classifier recall value
#   :total_precision - overall classifier precision value
#
# Raises GroundTruthFileNotConfiguredError when no ground truth file is
# configured and GroundTruthFileNotFoundError when the configured file
# does not exist.
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = classifier.evaluator
#   or
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#
#   evaluator.evaluate_testcorpus_classification
#   evaluator.evaluate_testcorpus_classification(sample_count: 50)
#
def evaluate_testcorpus_classification(options = {})
  ground_truth_file_path = @config.test_corpus_ground_truth_file

  unless ground_truth_file_path
    message = 'Ground truth file path has to be set for test set evaluation'
    raise GroundTruthFileNotConfiguredError, message
  end

  unless File.exist?(ground_truth_file_path)
    message = 'Configured ground truth file is not available.'
    raise GroundTruthFileNotFoundError, message
  end

  ground_truth = ground_truth_hash(ground_truth_file_path)
  # Always (re)creates the classification file before reading it back.
  create_testcorpus_classification_file!(@config.test_output_classification_file, ground_truth)
  classification = classification_hash(@config.test_output_classification_file)

  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, sample_count)
  # Pick the best precision/recall pair as the overall totals.
  precision_recall = maximum_precision_recall(curves[:precisions], curves[:recalls])

  curves[:total_recall] = precision_recall[:recall]
  curves[:total_precision] = precision_recall[:precision]

  curves
end
|
157
|
+
|
158
|
+
# Returns the performance curve points (recall, precision, fp-rate) and
# computed area under curves.
def test_performance_curves(ground_truth, classification, sample_count)
  step = 1.0 / sample_count.to_f
  # Drop the first threshold (0.0) to keep the [0,1] point off the curve.
  thresholds = (0.0...1.0).step(step).to_a.drop(1)

  # One [precision, recall, fp_rate] row per threshold.
  rows = thresholds.map do |threshold|
    counts = predictive_values(ground_truth, classification, threshold)
    params = performance_parameters(
      counts[:tp],
      counts[:fp],
      counts[:tn],
      counts[:fn]
    )

    [params[:precision], params[:recall], params[:fp_rate]]
  end

  if rows.empty?
    precisions = []
    recalls = []
    fp_rates = []
  else
    precisions, recalls, fp_rates = rows.transpose
  end

  tp_rates = recalls
  pr_sorted = sort_curve_values(recalls, precisions, x: 0.0, y: 0.0)
  roc_sorted = sort_curve_values(fp_rates, tp_rates, y: 0.0, x: 1.0)

  recalls = pr_sorted[:x]
  precisions = pr_sorted[:y]
  fp_rates = roc_sorted[:x]
  tp_rates = roc_sorted[:y]

  {
    precisions: precisions, recalls: recalls,
    fp_rates: fp_rates, tp_rates: tp_rates,
    pr_auc: area_under_curve(recalls, precisions),
    roc_auc: area_under_curve(fp_rates, tp_rates)
  }
end
|
202
|
+
|
203
|
+
# Returns the predictive values hash (TP, FP, TN, FN) for a certain
# threshold.
def predictive_values(ground_truth, classification, threshold)
  # tp: vandalism classified as vandalism
  # fp: regular classified as vandalism
  # tn: regular classified as regular
  # fn: vandalism classified as regular
  counts = { tp: 0, fp: 0, tn: 0, fn: 0 }

  ground_truth.each_value do |annotation|
    target_class = annotation[:class]

    key = :"#{annotation[:old_revision_id]}-#{annotation[:new_revision_id]}"
    # Skip annotated samples that never made it into the classification.
    next unless classification.key?(key)

    confidence = classification[key][:confidence]

    counts[:tp] += 1 if Evaluator.true_positive?(target_class, confidence, threshold)
    counts[:fn] += 1 if Evaluator.false_negative?(target_class, confidence, threshold)
    counts[:fp] += 1 if Evaluator.false_positive?(target_class, confidence, threshold)
    counts[:tn] += 1 if Evaluator.true_negative?(target_class, confidence, threshold)
  end

  counts
end
|
229
|
+
|
230
|
+
# Returns whether the given confidence value represents a
# true positive (TP) regarding the given target class and threshold.
def self.true_positive?(target_class, confidence, threshold)
  confidence.to_f > threshold.to_f &&
    target_class == Instances::VANDALISM_SHORT
end
|
235
|
+
|
236
|
+
# Returns whether the given confidence value represents a
# true negative (TN) regarding the given target class and threshold.
def self.true_negative?(target_class, confidence, threshold)
  confidence.to_f < threshold.to_f &&
    target_class == Instances::REGULAR_SHORT
end
|
241
|
+
|
242
|
+
# Returns whether the given confidence value represents a
# false positive (FP) regarding the given target class and threshold.
def self.false_positive?(target_class, confidence, threshold)
  confidence.to_f >= threshold.to_f &&
    target_class == Instances::REGULAR_SHORT
end
|
247
|
+
|
248
|
+
# Returns whether the given confidence value represents a
# false negative (FN) regarding the given target class and threshold.
def self.false_negative?(target_class, confidence, threshold)
  confidence.to_f <= threshold.to_f &&
    target_class == Instances::VANDALISM_SHORT
end
|
253
|
+
|
254
|
+
# Returns a hash with performance parameters (precision, recall, fp_rate)
# computed from the given TP, FP, TN, FN counts.
# Degenerate ratios (zero denominator) are reported as 1.0.
def performance_parameters(tp, fp, tn, fn)
  precision = (tp + fp).zero? ? 1.0 : tp.fdiv(tp + fp)
  recall    = (tp + fn).zero? ? 1.0 : tp.fdiv(tp + fn)
  fp_rate   = (fp + tn).zero? ? 1.0 : fp.fdiv(fp + tn)

  { precision: precision, recall: recall, fp_rate: fp_rate }
end
|
267
|
+
|
268
|
+
# Returns the calculated area under curve for given point values.
# x and y values have to be float arrays of the same length.
def area_under_curve(x_values, y_values)
  unless x_values.count == y_values.count
    raise ArgumentError, 'x and y values must have the same length!'
  end

  # Trapezoid rule over consecutive point pairs: A = 1/2 * (b1 + b2) * h
  segments = x_values.each_cons(2).zip(y_values.each_cons(2))

  total = segments.inject(0.0) do |acc, ((x1, x2), (y1, y2))|
    acc + 0.5 * (y1 + y2) * (x2 - x1)
  end

  total.abs
end
|
291
|
+
|
292
|
+
# Returns the given value arrays sorted by the first array (x_values).
# Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> }.
# start_values is added in front of the arrays if set, e.g. { x: 0.0, y: 1.0 }
# end_values is added to the end of the arrays if set, e.g. { x: 1.0, y: 1.0 }
#
# @example
#   evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
#   #=> Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  # Sort pairs by x ascending; for equal x, by y descending. Drop duplicates.
  pairs = x_values.zip(y_values)
  pairs = pairs.sort_by { |x, y| [x, -y] }.uniq

  columns = pairs.transpose
  x = columns[0]
  y = columns[1]

  prepend_start = start_values && (start_values.key?(:x) || start_values.key?(:y))
  append_end = end_values && (end_values.key?(:x) || end_values.key?(:y))

  if prepend_start && !(x.first == start_values[:x] && y.first == start_values[:y])
    x.unshift(start_values[:x] || x.first)
    y.unshift(start_values[:y] || y.first)
  end

  if append_end && !(x.last == end_values[:x] && y.last == end_values[:y])
    x.push(end_values[:x] || x.last)
    y.push(end_values[:y] || y.last)
  end

  { x: x, y: y }
end
|
326
|
+
|
327
|
+
# Returns the precision/recall pair whose product (precision * recall)
# is maximal. Pairs containing NaN values are ignored.
def maximum_precision_recall(precisions, recalls)
  areas = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # Remove pairs polluted by NaN values (e.g. from 0/0 divisions upstream).
  areas.select! { |b| b.all? { |f| !f.to_f.nan? } }

  # Array comparison picks the largest product; ties resolve to the
  # higher index. (The previous `areas.sort.max` sorted redundantly in
  # O(n log n) before taking the maximum — `max` alone is equivalent.)
  max_index = areas.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end
|
339
|
+
|
340
|
+
# Creates the test corpus text file by classifying the configured test
# samples. All sub steps (as creating the test arff file, etc.) are run
# automatically if needed.
#
# The output is a whitespace-separated text file with the header
# "OLDREVID NEWREVID C CONF <FEATURE NAMES...>" and one line per
# classified edit.
#
# Raises an ArgumentError when ground_truth_data is nil.
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  if ground_truth_data.nil?
    raise ArgumentError, 'Ground truth data hash is not allowed to be nil'
  end

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  FileUtils.mkdir_p(dir_name) unless Dir.exist?(dir_name)
  file = File.open(file_path, 'w')

  # The last two attributes are presumably the revision ids / class, not
  # features — TODO confirm against TestDataset.
  feature_names = dataset.attribute_names.map(&:upcase)[0...-2]
  header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

  file.puts header

  dataset.to_m.to_a.each do |instance|
    # Instance layout: [*features, old_revision_id, new_revision_id, class]
    features = instance[0...-3]
    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i
    ground_truth_class_name = Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

    classification = @classifier.classify(features, return_all_params: true)

    # Map the predicted class index to a numeric value used as a fallback
    # confidence: 1.0 for vandalism, 0.0 for regular, missing otherwise.
    if classification[:class_index] == Instances::VANDALISM_CLASS_INDEX
      class_value = 1.0
    elsif classification[:class_index] == Instances::REGULAR_CLASS_INDEX
      class_value = 0.0
    else
      class_value = Features::MISSING_VALUE
    end

    confidence = classification[:confidence] || class_value

    # One-class classifiers trained on the vandalism class report the
    # inverse confidence, so it is flipped here.
    must_be_inverted = @config.use_occ? && !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
    confidence_value = must_be_inverted ? 1.0 - confidence : confidence
    # Replace literal NaN feature values with the missing-value marker.
    features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

    file.puts [
      old_revision_id,
      new_revision_id,
      ground_truth_class_name,
      confidence_value,
      *features
    ].join(' ')
  end

  file.close
end
|
392
|
+
|
393
|
+
# Returns a hash comprising each feature's predictive values analysis for
# different thresholds.
# The Hash structure is the following one:
# {
#   feature_name_1:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
#   ...,
#   feature_name_n:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
# }
def feature_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  # Loop-invariant: the ground truth file does not change between
  # features, so parse it once instead of once per feature (the former
  # code re-read the file on every iteration).
  ground_truth = ground_truth_hash(@config.test_corpus_ground_truth_file)
  training_dataset = TrainingDataset.instances
  test_dataset = TestDataset.build!

  analysis = {}

  @config.features.each_with_index do |feature_name, index|
    puts "analyzing feature… '#{feature_name}'"

    # Train a classifier on this single attribute only.
    dataset = filter_single_attribute(training_dataset, index)
    print ' | train classifier with feature data…'
    classifier = Classifier.new(dataset)
    print "done \n"

    classification = classification_data(classifier, test_dataset)

    analysis[feature_name] = thresholds.each_with_object({}) do |threshold, values|
      values[threshold] = predictive_values(ground_truth, classification, threshold)
    end
  end

  analysis
end
|
443
|
+
|
444
|
+
# Returns a hash comprising the classifier's predictive values for using
# all configured features, keyed by threshold:
# { 0.0 => {fp:, fn:, tp:, tn:}, ..., 1.0 => {fp:, fn:, tp:, tn:} }
def full_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file

  puts 'train classifier…'
  classifier = Classifier.new

  test_dataset = TestDataset.build!

  puts 'computing classification…'
  classification = classification_data(classifier, test_dataset)
  ground_truth = ground_truth_hash(ground_truth_file_path)

  # One predictive-values entry per threshold.
  analysis = thresholds.each_with_object({}) do |threshold, result|
    result[threshold] = predictive_values(ground_truth, classification, threshold)
  end

  print "done\n"
  analysis
end
|
470
|
+
|
471
|
+
private
|
472
|
+
|
473
|
+
# Returns a dataset reduced to the attribute at the given index plus the
# class attribute, using Weka's unsupervised Remove filter.
def filter_single_attribute(dataset, attribute_index)
  remove_filter = Weka::Filters::Unsupervised::Attribute::Remove.new
  # -V inverts the selection (keep instead of remove); -R takes 1-based
  # attribute ranges, hence the +1 offsets.
  kept_columns = [attribute_index + 1, dataset.class_index + 1].join(',')
  remove_filter.use_options("-V -R #{kept_columns}")

  filtered = remove_filter.filter(dataset)
  # The class attribute ends up as the last remaining attribute.
  filtered.class_index = filtered.attributes_count - 1
  filtered
end
|
483
|
+
|
484
|
+
# Classifies every instance of the given test dataset with the given
# classifier and returns a hash keyed by
# :"<old_revision_id>-<new_revision_id>", each entry holding the revision
# ids, the predicted class short name, and the (possibly inverted)
# confidence.
def classification_data(classifier, test_dataset)
  classification = {}

  test_dataset.to_m.to_a.each do |instance|
    features = instance[0...-3]

    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i

    params = classifier.classify(features, return_all_params: true)
    class_short_name = Instances::CLASSES_SHORT[params[:class_index]]

    # Inspect the classifier passed in, not @classifier: callers such as
    # #feature_analysis build a fresh classifier per feature, so the
    # inversion check must look at that classifier's options (the former
    # code read @classifier here while classifying with `classifier`).
    # NOTE(review): this inverts when the options do NOT match
    # /VANDALISM/, whereas #create_testcorpus_classification_file!
    # inverts when they DO — confirm which condition is intended.
    must_be_inverted = @config.use_occ? && classifier.classifier_instance.options !~ /#{Instances::VANDALISM}/
    confidence = must_be_inverted ? 1.0 - params[:confidence] : params[:confidence]

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short_name,
      confidence: confidence
    }
  end

  classification
end
|
511
|
+
|
512
|
+
# Parses the given classification file into a hash keyed by
# :"<old_revision_id>-<new_revision_id>", each entry holding the revision
# ids, class short name, and confidence. The first line is treated as the
# column header and skipped.
def classification_hash(classification_file)
  _header, *samples = File.read(classification_file).lines.to_a

  samples.each_with_object({}) do |line, classification|
    parts = line.split(' ')

    old_id = parts[0].to_i
    new_id = parts[1].to_i

    classification[:"#{old_id}-#{new_id}"] = {
      old_revision_id: old_id,
      new_revision_id: new_id,
      class: parts[2],
      confidence: parts[3].to_f
    }
  end
end
|
538
|
+
|
539
|
+
# Parses the given ground truth file into a hash keyed by
# :"<old_revision_id>-<new_revision_id>", each entry holding the two
# revision ids and the annotated class short name. Every line is parsed
# (no header row is expected).
def ground_truth_hash(ground_truth_file)
  File.read(ground_truth_file).lines.each_with_object({}) do |line, ground_truth|
    old_id, new_id, class_short = line.split(' ')
    old_id = old_id.to_i
    new_id = new_id.to_i

    ground_truth[:"#{old_id}-#{new_id}"] = {
      old_revision_id: old_id,
      new_revision_id: new_id,
      class: class_short
    }
  end
end
|
562
|
+
|
563
|
+
# Runs a <fold>-fold cross validation of the trained classifier over the
# full dataset and returns the resulting evaluation. Any failure is
# re-raised as a RuntimeError with a descriptive message.
def cross_validate_all_instances(fold)
  begin
    @classifier_instance.cross_validate(folds: fold)
  rescue StandardError => validation_error
    raise "Error while cross validation: #{validation_error}"
  end
end
|
570
|
+
|
571
|
+
# Cross validates the classifier 10 times over equally distributed
# (balanced) training datasets with <fold>-fold cross validation,
# appending running evaluation averages to an output file.
#
# Returns the collected evaluation objects.
def cross_validate_equally_distributed(fold)
  dirname = @config.output_base_directory
  # mkdir_p (instead of mkdir) also creates missing parent directories,
  # matching #create_testcorpus_classification_file!; plain mkdir raises
  # Errno::ENOENT when the parent does not exist.
  FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)

  file_name = 'cross_validation_eq_distr.txt'
  file_path = File.join(dirname, file_name)

  puts "Writing to #{file_path}…"
  result_file = File.open(file_path, 'a')

  begin
    time = Time.now.strftime('%Y-%m-%d %H:%M')
    type = @config.classifier_type
    options = @config.classifier_options || 'default'
    result_file.puts "\nCROSS VALIDATION - #{fold} fold (Classifier: #{type}, options: #{options} ) | #{time}"
    result_file.puts "Features: \n\t#{@config.features.join("\n\t")}\n\n"

    evaluations = []
    times = 10

    # run n times validation, each on a freshly balanced dataset
    (1..times).each do |i|
      uniform_dataset = TrainingDataset.balanced_instances

      print "\rcross validate dataset (equally distributed)… #{i}/#{times} | instances: #{uniform_dataset.size}"
      @classifier_instance.train_with_instances(uniform_dataset)
      evaluations << @classifier_instance.cross_validate(folds: fold)

      # write running averages after every 10% of the runs
      if (i % (times / 10)).zero?
        print_evaluation_data(evaluations, result_file, i)
      end
    end

    evaluations
  rescue => error
    raise "Error while cross validation for equally distributed instances: #{error}"
  ensure
    result_file.close
    puts "\nThe evaluation results has been saved to #{file_path}"
  end
end
|
616
|
+
|
617
|
+
# Returns a hash with the average precision, recall, and area under the
# PR curve of the given evaluations for the vandalism class.
def evaluation_data_of(evaluations)
  class_index = Instances::VANDALISM_CLASS_INDEX
  total_count = evaluations.count.to_f

  # Average a single metric over all evaluation objects.
  average = lambda do |metric|
    evaluations.reduce(0.0) { |sum, evaluation| sum + evaluation.public_send(metric, class_index) } / total_count
  end

  {
    precision: average.call(:precision),
    recall: average.call(:recall),
    area_under_prc: average.call(:area_under_prc)
  }
end
|
632
|
+
|
633
|
+
# Writes one summary line (precision, recall, area under PRC averages of
# the given evaluations) prefixed with the given index to the file.
def print_evaluation_data(evaluations, file, index)
  data = evaluation_data_of(evaluations)
  summary = "#{index}\tprecision: #{data[:precision]} | recall: #{data[:recall]} | Area under PRC: #{data[:area_under_prc]}"
  file.puts summary
end
|
638
|
+
end
|
639
|
+
end
|
640
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    # @abstract Exceptions raised by Wikipedia::VandalismDetection inherit from
    # this Error
    class Error < StandardError; end

    # Raised when trying to classify without a configured classifier
    class ClassifierNotConfiguredError < Error; end

    # Raised when trying to classify with an unknown classifier
    class ClassifierUnknownError < Error; end

    # Raised when trying to use features without having configured any
    class FeaturesNotConfiguredError < Error; end

    # Raised when trying to use the edits file without having configured one
    class EditsFileNotConfiguredError < Error; end

    # Raised when trying to use the annotations file without having
    # configured one
    class AnnotationsFileNotConfiguredError < Error; end

    # Raised when trying to read the revisions directory without having
    # configured one
    class RevisionsDirectoryNotConfiguredError < Error; end

    # Raised when trying to classify without a configured ground truth
    # test file
    class GroundTruthFileNotConfiguredError < Error; end

    # Raised when no ARFF file is available
    class ArffFileNotFoundError < Error; end

    # Raised when no ground truth file is available
    class GroundTruthFileNotFoundError < Error; end

    # Raised when an already available feature should be added to the
    # ARFF file again
    class FeatureAlreadyUsedError < Error; end

    # Raised when a revision's text file cannot be found and loaded
    class RevisionFileNotFound < Error; end
  end
end
|