wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,640 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/configuration'
|
|
2
|
+
require 'wikipedia/vandalism_detection/exceptions'
|
|
3
|
+
require 'wikipedia/vandalism_detection/training_dataset'
|
|
4
|
+
require 'wikipedia/vandalism_detection/test_dataset'
|
|
5
|
+
require 'wikipedia/vandalism_detection/classifier'
|
|
6
|
+
require 'wikipedia/vandalism_detection/instances'
|
|
7
|
+
require 'weka'
|
|
8
|
+
require 'fileutils'
|
|
9
|
+
require 'csv'
|
|
10
|
+
|
|
11
|
+
module Wikipedia
|
|
12
|
+
module VandalismDetection
|
|
13
|
+
# This class provides methods for the evaluation of a
|
|
14
|
+
# Wikipedia::VandalismDetection::Classifier using the weka framwork.
|
|
15
|
+
#
|
|
16
|
+
# @example
|
|
17
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
|
18
|
+
# evaluator = Wikipedia::VandalsimDetection::Evaluator(classifier)
|
|
19
|
+
#
|
|
20
|
+
# evaluation = evaluator.cross_validate
|
|
21
|
+
# evaluation = evaluator.cross_validate(equally_distributed: true)
|
|
22
|
+
#
|
|
23
|
+
# puts evaluation[:precision]
|
|
24
|
+
# puts evaluation[:recall]
|
|
25
|
+
# puts evaluation[:area_under_prc]
|
|
26
|
+
class Evaluator
|
|
27
|
+
DEFAULT_SAMPLE_COUNT = 200
|
|
28
|
+
DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS
|
|
29
|
+
|
|
30
|
+
# Builds an evaluator for the given classifier.
#
# @param classifier [Wikipedia::VandalismDetection::Classifier]
# @raise [ArgumentError] if the argument is not a Classifier instance
def initialize(classifier)
  valid_classifier = classifier.is_a?(Wikipedia::VandalismDetection::Classifier)

  unless valid_classifier
    raise ArgumentError,
          'The classifier argument has to be an instance of ' \
          'Wikipedia::VandalismDetection::Classifier'
  end

  @config = Wikipedia::VandalismDetection.config
  @classifier = classifier
  @classifier_instance = classifier.classifier_instance
end
|
|
41
|
+
|
|
42
|
+
# Cross validates the classifier.
|
|
43
|
+
# Fold is used as defined in configuration (default is 10).
|
|
44
|
+
#
|
|
45
|
+
# @example
|
|
46
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
|
47
|
+
# evaluation = classifier.cross_validate
|
|
48
|
+
# evaluation = classifier.cross_validate(equally_distributed: true)
|
|
49
|
+
#
|
|
50
|
+
# Cross validates the classifier with the configured fold count
# (falls back to the default fold when none is configured).
#
# @param options [Hash] pass `equally_distributed: true` to validate on
#   an equally distributed sample set.
def cross_validate(options = {})
  fold = @config.cross_validation_fold ||
         DEFAULTS['classifier']['cross-validation-fold']

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end
|
|
62
|
+
|
|
63
|
+
# Returns a Hash comprising the evaluation curve data Arrays for precision, recall
|
|
64
|
+
#
|
|
65
|
+
# @example
|
|
66
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
|
67
|
+
# evaluator = classifier.evaluator
|
|
68
|
+
# or
|
|
69
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
|
70
|
+
# evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
|
|
71
|
+
#
|
|
72
|
+
# curve_data = evaluator.curve_data
|
|
73
|
+
#
|
|
74
|
+
# curve_data[:precision]
|
|
75
|
+
# # => [0.76, ..., 0.91]
|
|
76
|
+
#
|
|
77
|
+
# curve_data[:recall]
|
|
78
|
+
# # => [0.87, ..., 0.89]
|
|
79
|
+
#
|
|
80
|
+
# curve_data[:area_under_prc]
|
|
81
|
+
# # => 0.83
|
|
82
|
+
# Computes precision/recall curve data from a cross validation run.
#
# Returns a Hash with:
#   :precision      - Array of precision points
#   :recall         - Array of recall points
#   :area_under_prc - area under the precision-recall curve
def curve_data(options = {})
  evaluations = cross_validate(options)

  # cross_validate may return a single evaluation or an array of them
  evaluation = evaluations.is_a?(Array) ? evaluations.first : evaluations

  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new
  instances = threshold_curve.curve(
    evaluation.predictions,
    Instances::VANDALISM_CLASS_INDEX
  )

  {
    precision: instances.attribute_to_double_array(0).to_a,
    recall: instances.attribute_to_double_array(1).to_a,
    area_under_prc: evaluation.area_under_prc(Instances::VANDALISM_CLASS_INDEX)
  }
end
|
|
103
|
+
|
|
104
|
+
# Evaluates the classification of the configured test corpus against the
|
|
105
|
+
# given ground truth.
|
|
106
|
+
# Runs the file creation automatically unless the classification file
|
|
107
|
+
# exists, yet.
|
|
108
|
+
#
|
|
109
|
+
# Number of samples to use can be set by 'sample_count: <number>'
|
|
110
|
+
# option. Default number of samples is 100.
|
|
111
|
+
#
|
|
112
|
+
# Returns a Hash with values:
|
|
113
|
+
# :recalls - recall values
|
|
114
|
+
# :precisions - precision values
|
|
115
|
+
# :fp_rates - fals positive rate values
|
|
116
|
+
# :auprc - area under precision recall curve
|
|
117
|
+
# :auroc - area under receiver operator curve
|
|
118
|
+
# :total_recall - overall classifier recall value
|
|
119
|
+
# :total_precision - overall classifier precision value
|
|
120
|
+
#
|
|
121
|
+
# @example
|
|
122
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
|
123
|
+
# evaluator = classifier.evaluator
|
|
124
|
+
# or
|
|
125
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
|
126
|
+
# evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
|
|
127
|
+
#
|
|
128
|
+
# evaluator.evaluate_testcorpus_classification
|
|
129
|
+
# evaluator.evaluate_testcorpus_classification(sample_count: 50)
|
|
130
|
+
#
|
|
131
|
+
# Evaluates the classification of the configured test corpus against the
# configured ground truth file. (Re)creates the classification file first.
#
# @param options [Hash] `sample_count:` sets the number of threshold
#   samples (defaults to DEFAULT_SAMPLE_COUNT).
# @raise [GroundTruthFileNotConfiguredError] when no ground truth path is set
# @raise [GroundTruthFileNotFoundError] when the configured file is missing
# @return [Hash] curve data plus :total_recall and :total_precision
def evaluate_testcorpus_classification(options = {})
  ground_truth_file = @config.test_corpus_ground_truth_file

  unless ground_truth_file
    raise GroundTruthFileNotConfiguredError,
          'Ground truth file path has to be set for test set evaluation'
  end

  unless File.exist?(ground_truth_file)
    raise GroundTruthFileNotFoundError,
          'Configured ground truth file is not available.'
  end

  ground_truth = ground_truth_hash(ground_truth_file)

  classification_file = @config.test_output_classification_file
  create_testcorpus_classification_file!(classification_file, ground_truth)
  classification = classification_hash(classification_file)

  samples = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, samples)

  best = maximum_precision_recall(curves[:precisions], curves[:recalls])
  curves[:total_recall] = best[:recall]
  curves[:total_precision] = best[:precision]

  curves
end
|
|
157
|
+
|
|
158
|
+
# Returns the performance curve points (recall, precision, fp-rate) and
|
|
159
|
+
# computed area under curves.
|
|
160
|
+
# Computes performance curve points (precision, recall, fp-rate) for
# sample_count thresholds and the areas under the PR and ROC curves.
#
# @return [Hash] :precisions, :recalls, :fp_rates, :tp_rates, :pr_auc, :roc_auc
def test_performance_curves(ground_truth, classification, sample_count)
  step = 1.0 / sample_count.to_f

  # drop the leading 0.0 threshold so the [0, 1] point is not part of the curve
  thresholds = (0.0...1.0).step(step).to_a.drop(1)

  points = thresholds.map do |threshold|
    counts = predictive_values(ground_truth, classification, threshold)
    performance_parameters(counts[:tp], counts[:fp], counts[:tn], counts[:fn])
  end

  precisions = points.map { |point| point[:precision] }
  recalls    = points.map { |point| point[:recall] }
  fp_rates   = points.map { |point| point[:fp_rate] }

  # recall doubles as the true positive rate for the ROC curve
  pr_sorted  = sort_curve_values(recalls, precisions, x: 0.0, y: 0.0)
  roc_sorted = sort_curve_values(fp_rates, recalls, y: 0.0, x: 1.0)

  {
    precisions: pr_sorted[:y], recalls: pr_sorted[:x],
    fp_rates: roc_sorted[:x], tp_rates: roc_sorted[:y],
    pr_auc: area_under_curve(pr_sorted[:x], pr_sorted[:y]),
    roc_auc: area_under_curve(roc_sorted[:x], roc_sorted[:y])
  }
end
|
|
202
|
+
|
|
203
|
+
# Returns the predictive values hash (TP,FP, TN, FN) for a certain
|
|
204
|
+
# threshold.
|
|
205
|
+
# Counts the predictive values (TP, FP, TN, FN) for a certain threshold.
#
# @param ground_truth [Hash] annotated samples keyed by edit
# @param classification [Hash] classified edits keyed by "<old>-<new>" symbols
# @param threshold [Numeric] confidence cut-off
# @return [Hash] { tp:, fp:, tn:, fn: }
def predictive_values(ground_truth, classification, threshold)
  counts = { tp: 0, fp: 0, tn: 0, fn: 0 }

  ground_truth.each do |_key, annotation|
    target_class = annotation[:class]
    key = :"#{annotation[:old_revision_id]}-#{annotation[:new_revision_id]}"

    # skip annotated edits that never made it into the classification file
    next unless classification.key?(key)

    confidence = classification[key][:confidence]

    counts[:tp] += 1 if Evaluator.true_positive?(target_class, confidence, threshold)
    counts[:fn] += 1 if Evaluator.false_negative?(target_class, confidence, threshold)
    counts[:fp] += 1 if Evaluator.false_positive?(target_class, confidence, threshold)
    counts[:tn] += 1 if Evaluator.true_negative?(target_class, confidence, threshold)
  end

  counts
end
|
|
229
|
+
|
|
230
|
+
# Returns whether the given confidence value represents a
|
|
231
|
+
# true positive (TP) regarding the given target class and threshold.
|
|
232
|
+
# True positive: a vandalism sample whose confidence lies strictly
# above the threshold.
def self.true_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT

  confidence.to_f > threshold.to_f
end
|
|
235
|
+
|
|
236
|
+
# Returns whether the given confidence value represents a
|
|
237
|
+
# true negative (TN) regarding the given target class and threshold.
|
|
238
|
+
# True negative: a regular sample whose confidence lies strictly
# below the threshold.
def self.true_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT

  confidence.to_f < threshold.to_f
end
|
|
241
|
+
|
|
242
|
+
# Returns whether the given confidence value represents a
|
|
243
|
+
# false positive (FP) regarding the given target class and threshold.
|
|
244
|
+
# False positive: a regular sample whose confidence is at or above the
# threshold. NOTE(review): the comparison is `>=` here but `>` in
# true_positive?, so confidence == threshold is counted as a vandalism
# prediction only for regular samples — confirm this asymmetry is intended.
def self.false_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT

  confidence.to_f >= threshold.to_f
end
|
|
247
|
+
|
|
248
|
+
# Returns whether the given confidence value represents a
|
|
249
|
+
# false negative (FN) regarding the given target class and threshold.
|
|
250
|
+
# False negative: a vandalism sample whose confidence is at or below the
# threshold (complement of true_positive? for vandalism samples).
def self.false_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT

  confidence.to_f <= threshold.to_f
end
|
|
253
|
+
|
|
254
|
+
# Returns a hash with performance parameters computed from given
|
|
255
|
+
# TP, FP, TN, FN
|
|
256
|
+
# Derives precision, recall and false positive rate from the given
# TP, FP, TN, FN counts. Degenerate denominators (no predicted positives,
# no actual positives, no actual negatives) yield 1.0.
#
# @return [Hash] { precision:, recall:, fp_rate: }
def performance_parameters(tp, fp, tn, fn)
  predicted_positives = tp + fp
  actual_positives = tp + fn
  actual_negatives = fp + tn

  {
    precision: predicted_positives.zero? ? 1.0 : tp.to_f / predicted_positives.to_f,
    recall: actual_positives.zero? ? 1.0 : tp.to_f / actual_positives.to_f,
    fp_rate: actual_negatives.zero? ? 1.0 : fp.to_f / actual_negatives.to_f
  }
end
|
|
267
|
+
|
|
268
|
+
# Returns the calculated area under curve for given point values
|
|
269
|
+
# x and y values has to be float arrays of the same length.
|
|
270
|
+
# Computes the area under the curve given by the point arrays using the
# trapezoid rule: A = 1/2 * (b1 + b2) * h per consecutive point pair.
# Returns the absolute value of the summed area.
#
# @raise [ArgumentError] when the arrays differ in length
def area_under_curve(x_values, y_values)
  unless x_values.count == y_values.count
    raise ArgumentError, 'x and y values must have the same length!'
  end

  points = x_values.zip(y_values)

  area = points.each_cons(2).reduce(0.0) do |sum, ((x1, y1), (x2, y2))|
    sum + 0.5 * (y1 + y2) * (x2 - x1)
  end

  area.abs
end
|
|
291
|
+
|
|
292
|
+
# Returns given value array sorted by first array (x_values)
|
|
293
|
+
# Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> }
|
|
294
|
+
# start_value is added in front of arrays if set, e.g. {x: 0.0, y: 1.0}
|
|
295
|
+
# end_values is added to end of arrays if set, e.g. {x: 1.0, y: 1.0 }
|
|
296
|
+
#
|
|
297
|
+
# @example
|
|
298
|
+
# evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
|
|
299
|
+
# #=>Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
|
|
300
|
+
# Sorts the (x, y) point pairs by ascending x (descending y on ties) and
# removes duplicate pairs.
#
# start_values (e.g. { x: 0.0, y: 0.0 }) is prepended and end_values
# appended unless the curve already starts/ends on that point; a missing
# :x or :y component is filled from the curve's boundary value.
#
# @return [Hash] { x: <sorted x values>, y: <y values sorted by x> }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  pairs = x_values.zip(y_values)
  pairs = pairs.sort_by { |x_value, y_value| [x_value, -y_value] }.uniq

  x, y = pairs.transpose

  prepend = start_values && (start_values.key?(:x) || start_values.key?(:y))
  append = end_values && (end_values.key?(:x) || end_values.key?(:y))

  if prepend && !(x.first == start_values[:x] && y.first == start_values[:y])
    x.unshift(start_values[:x] || x.first)
    y.unshift(start_values[:y] || y.first)
  end

  if append && !(x.last == end_values[:x] && y.last == end_values[:y])
    x.push(end_values[:x] || x.last)
    y.push(end_values[:y] || y.last)
  end

  { x: x, y: y }
end
|
|
326
|
+
|
|
327
|
+
# Returns the maximum precision recall pair
|
|
328
|
+
# Returns the precision/recall pair that maximizes the product
# precision * recall over all curve points.
#
# @return [Hash] { precision:, recall: }
def maximum_precision_recall(precisions, recalls)
  # pair each precision * recall product with its index so the winning
  # point can be looked up in the input arrays afterwards
  areas = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # remove pairs polluted by NaN values (e.g. from undefined curve points)
  areas.select! { |pair| pair.all? { |value| !value.to_f.nan? } }

  # `max` alone suffices here; the previous `sort.max` performed a
  # redundant O(n log n) sort before taking the maximum
  max_index = areas.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end
|
|
339
|
+
|
|
340
|
+
# Creates the test corpus text file by classifying the configured test
# samples. All sub steps (as creating the test arff file, etc.) are run
# automatically if needed.
#
# The output file gets one header line followed by one line per test
# instance: OLDREVID NEWREVID C CONF FEATURE_1 … FEATURE_N
#
# @param file_path [String] output path; parent directories are created
#   if missing.
# @param ground_truth_data [Hash] must not be nil. NOTE(review): only its
#   presence is checked here; the value itself is unused — confirm intent.
# @raise [ArgumentError] if ground_truth_data is nil.
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  if ground_truth_data.nil?
    raise ArgumentError, 'Ground truth data hash is not allowed to be nil'
  end

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  FileUtils.mkdir_p(dir_name) unless Dir.exist?(dir_name)

  # Block form guarantees the file handle is closed even when
  # classification raises mid-loop (the original leaked it on errors).
  File.open(file_path, 'w') do |file|
    feature_names = dataset.attribute_names.map(&:upcase)[0...-2]
    file.puts ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

    dataset.to_m.to_a.each do |instance|
      # Last three attributes are old revision id, new revision id, class.
      features = instance[0...-3]
      old_revision_id = instance[-3].to_i
      new_revision_id = instance[-2].to_i
      ground_truth_class_name = Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

      classification = @classifier.classify(features, return_all_params: true)

      class_value =
        case classification[:class_index]
        when Instances::VANDALISM_CLASS_INDEX then 1.0
        when Instances::REGULAR_CLASS_INDEX then 0.0
        else Features::MISSING_VALUE
        end

      confidence = classification[:confidence] || class_value

      # Presumably one-class classifiers configured with the vandalism
      # class report inverted confidences — TODO confirm against the
      # inverse condition used in #classification_data.
      must_be_inverted = @config.use_occ? && !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
      confidence_value = must_be_inverted ? 1.0 - confidence : confidence

      # Replace NaN feature values with the missing-value marker.
      features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

      file.puts [
        old_revision_id,
        new_revision_id,
        ground_truth_class_name,
        confidence_value,
        *features
      ].join(' ')
    end
  end
end
|
|
392
|
+
|
|
393
|
+
# Returns a hash comprising each feature's predictive values analysis for
# different thresholds.
# The Hash structure is the following one:
# {
#   feature_name_1:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
#   ...,
#   feature_name_n:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
# }
def feature_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file
  training_dataset = TrainingDataset.instances
  test_dataset = TestDataset.build!

  @config.features.each_with_index.each_with_object({}) do |(feature_name, index), analysis|
    puts "analyzing feature… '#{feature_name}'"

    # Train a fresh classifier on the single feature under analysis.
    single_feature_dataset = filter_single_attribute(training_dataset, index)
    print ' | train classifier with feature data…'
    classifier = Classifier.new(single_feature_dataset)
    print "done \n"

    classification = classification_data(classifier, test_dataset)
    ground_truth = ground_truth_hash(ground_truth_file_path)

    # Collect the predictive values per threshold for this feature.
    analysis[feature_name] = thresholds.each_with_object({}) do |threshold, values|
      values[threshold] = predictive_values(ground_truth, classification, threshold)
    end
  end
end
|
|
443
|
+
|
|
444
|
+
# Returns a hash comprising the classifiers predictive values for using
# all configured features for different thresholds.
def full_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file

  puts 'train classifier…'
  classifier = Classifier.new

  test_dataset = TestDataset.build!

  puts 'computing classification…'
  classification = classification_data(classifier, test_dataset)
  ground_truth = ground_truth_hash(ground_truth_file_path)

  # Build the threshold => predictive-values mapping.
  analysis = thresholds.each_with_object({}) do |threshold, result|
    result[threshold] = predictive_values(ground_truth, classification, threshold)
  end

  print "done\n"
  analysis
end
|
|
470
|
+
|
|
471
|
+
private
|
|
472
|
+
|
|
473
|
+
# Returns a dataset only holding the attribute at the given index
# (plus the class attribute).
# Weka Unsupervised Attribute Remove filter is used.
def filter_single_attribute(dataset, attribute_index)
  remove_filter = Weka::Filters::Unsupervised::Attribute::Remove.new
  # -V inverts the selection: keep only the given attribute and the
  # class attribute (Weka attribute ranges are 1-based).
  remove_filter.use_options("-V -R #{attribute_index + 1},#{dataset.class_index + 1}")

  filtered = remove_filter.filter(dataset)
  filtered.class_index = filtered.attributes_count - 1
  filtered
end
|
|
483
|
+
|
|
484
|
+
# Returns a hash of classification results (class short name and
# confidence, keyed by revision id pair) of the test corpus'
# classification with the given classifier.
def classification_data(classifier, test_dataset)
  test_dataset.to_m.to_a.each_with_object({}) do |instance, classification|
    # Last three attributes are old revision id, new revision id, class.
    features = instance[0...-3]

    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i

    params = classifier.classify(features, return_all_params: true)
    class_short_name = Instances::CLASSES_SHORT[params[:class_index]]

    # NOTE(review): this inversion condition (`!~`) is the opposite of the
    # one used in #create_testcorpus_classification_file! (`=~`) — confirm
    # which of the two is intended.
    must_be_inverted = @config.use_occ? && @classifier.classifier_instance.options !~ /#{Instances::VANDALISM}/
    confidence = must_be_inverted ? 1.0 - params[:confidence] : params[:confidence]

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short_name,
      confidence: confidence
    }
  end
end
|
|
511
|
+
|
|
512
|
+
# Returns a hash for classification data from the given classification
# file (as written by #create_testcorpus_classification_file!), keyed by
# :"<old_revision_id>-<new_revision_id>".
def classification_hash(classification_file)
  sample_lines = File.read(classification_file).lines
  sample_lines.shift # drop the header line

  sample_lines.each_with_object({}) do |line, classification|
    old_id, new_id, class_short, confidence = line.split(' ')

    classification[:"#{old_id.to_i}-#{new_id.to_i}"] = {
      old_revision_id: old_id.to_i,
      new_revision_id: new_id.to_i,
      class: class_short,
      confidence: confidence.to_f
    }
  end
end
|
|
538
|
+
|
|
539
|
+
# Returns a hash for classification data from the given ground truth
# file, keyed by :"<old_revision_id>-<new_revision_id>".
# Unlike #classification_hash, the ground truth file has no header line
# and no confidence column.
def ground_truth_hash(ground_truth_file)
  File.read(ground_truth_file).lines.each_with_object({}) do |line, ground_truth|
    old_id, new_id, class_short = line.split(' ')

    ground_truth[:"#{old_id.to_i}-#{new_id.to_i}"] = {
      old_revision_id: old_id.to_i,
      new_revision_id: new_id.to_i,
      class: class_short
    }
  end
end
|
|
562
|
+
|
|
563
|
+
# Cross validates classifier over full dataset with <fold>-fold cross
# validation.
# Any failure is re-raised as a RuntimeError with a descriptive message.
def cross_validate_all_instances(fold)
  begin
    @classifier_instance.cross_validate(folds: fold)
  rescue => error
    raise "Error while cross validation: #{error}"
  end
end
|
|
570
|
+
|
|
571
|
+
# Cross validates classifier over equally distributed dataset with
# <fold>-fold cross validation.
# Runs the validation 10 times on freshly balanced training instances,
# appending intermediate averaged results to an output file, and returns
# the collected evaluations.
def cross_validate_equally_distributed(fold)
  output_dir = @config.output_base_directory
  FileUtils.mkdir(output_dir) unless Dir.exist?(output_dir)

  file_path = File.join(output_dir, 'cross_validation_eq_distr.txt')

  puts "Writing to #{file_path}…"
  result_file = File.open(file_path, 'a')

  begin
    time = Time.now.strftime('%Y-%m-%d %H:%M')
    type = @config.classifier_type
    options = @config.classifier_options || 'default'
    result_file.puts "\nCROSS VALIDATION - #{fold} fold (Classifier: #{type}, options: #{options} ) | #{time}"
    result_file.puts "Features: \n\t#{@config.features.join("\n\t")}\n\n"

    runs = 10
    evaluations = []

    # Run the validation <runs> times, each on a fresh balanced dataset.
    1.upto(runs) do |run|
      uniform_dataset = TrainingDataset.balanced_instances

      print "\rcross validate dataset (equally distributed)… #{run}/#{runs} | instances: #{uniform_dataset.size}"
      @classifier_instance.train_with_instances(uniform_dataset)
      evaluations << @classifier_instance.cross_validate(folds: fold)

      # Write intermediate averaged results after every 10% of the runs.
      print_evaluation_data(evaluations, result_file, run) if (run % (runs / 10)).zero?
    end

    evaluations
  rescue => error
    raise "Error while cross validation for equally distributed instances: #{error}"
  ensure
    result_file.close
    puts "\nThe evaluation results has been saved to #{file_path}"
  end
end
|
|
616
|
+
|
|
617
|
+
# Returns the evaluation data average value hash of the given evaluations.
# Each evaluation must respond to #recall, #precision and #area_under_prc
# taking a class index.
def evaluation_data_of(evaluations)
  class_index = Instances::VANDALISM_CLASS_INDEX
  total_count = evaluations.count.to_f

  # reduce(0.0) keeps the original left-to-right float accumulation.
  average = lambda do |metric|
    evaluations.reduce(0.0) { |sum, sample| sum + sample.public_send(metric, class_index) } / total_count
  end

  {
    precision: average.call(:precision),
    recall: average.call(:recall),
    area_under_prc: average.call(:area_under_prc)
  }
end
|
|
632
|
+
|
|
633
|
+
# Writes the averaged evaluation data of the given evaluations to the
# given file, prefixed with the given index.
def print_evaluation_data(evaluations, file, index)
  data = evaluation_data_of(evaluations)
  line = "#{index}\tprecision: #{data[:precision]} | recall: #{data[:recall]} | Area under PRC: #{data[:area_under_prc]}"
  file.puts line
end
|
|
638
|
+
end
|
|
639
|
+
end
|
|
640
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    # @abstract Exceptions raised by Wikipedia::VandalismDetection inherit
    # from this Error
    class Error < StandardError; end

    # Raised when trying to classify without a configured classifier
    class ClassifierNotConfiguredError < Error; end

    # Raised when trying to classify with an unknown classifier
    class ClassifierUnknownError < Error; end

    # Raised when trying to use features without having configured some
    class FeaturesNotConfiguredError < Error; end

    # Raised when trying to use an edits file without having configured one
    class EditsFileNotConfiguredError < Error; end

    # Raised when trying to use an annotations file without having
    # configured one
    class AnnotationsFileNotConfiguredError < Error; end

    # Raised when trying to read a revisions directory without having
    # configured one
    class RevisionsDirectoryNotConfiguredError < Error; end

    # Raised when trying to classify without a configured ground truth
    # test file
    class GroundTruthFileNotConfiguredError < Error; end

    # Raised when there is no arff file available
    class ArffFileNotFoundError < Error; end

    # Raised when there is no ground truth file available
    class GroundTruthFileNotFoundError < Error; end

    # Raised when an already available feature should be added to the
    # arff file
    class FeatureAlreadyUsedError < Error; end

    # Raised when a revision's text file cannot be found and loaded.
    # NOTE(review): name breaks the *Error suffix convention of its
    # siblings; renaming would break callers, so it is kept as is.
    class RevisionFileNotFound < Error; end
  end
end
|