wikipedia-vandalism_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +4 -0
- data/README.md +265 -0
- data/Rakefile +12 -0
- data/lib/java/LibSVM.jar +0 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/libsvm.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/functions/lib_svm.rb +15 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +25 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +17 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +18 -0
- data/lib/weka/filters/supervised/instance/smote.rb +22 -0
- data/lib/wikipedia.rb +51 -0
- data/lib/wikipedia/vandalism_detection.rb +30 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +18 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +69 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +186 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +321 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +27 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +75 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +606 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +40 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +89 -0
- data/lib/wikipedia/vandalism_detection/features.rb +67 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +54 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +65 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/user_reputation.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +22 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +130 -0
- data/lib/wikipedia/vandalism_detection/page.rb +88 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +52 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +69 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +43 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +367 -0
- data/lib/wikipedia/vandalism_detection/text.rb +18 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +303 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +21 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +22 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +15 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +97 -0
- data/spec/factories/edit.rb +20 -0
- data/spec/factories/page.rb +13 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/config.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/macros/file_reading.rb +7 -0
- data/spec/support/macros/test_configuration.rb +71 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +36 -0
- data/spec/vandalism_detection/classifier_spec.rb +317 -0
- data/spec/vandalism_detection/configuration_spec.rb +517 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +137 -0
- data/spec/vandalism_detection/evaluator_spec.rb +671 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +128 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +61 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +23 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/base_spec.rb +49 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +38 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +37 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +27 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +42 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +33 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +33 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +49 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +51 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +26 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +41 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +35 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +35 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +26 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +35 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +44 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +28 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +60 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +35 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +57 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +38 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +50 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +22 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +37 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/user_reputation_spec.rb +52 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +22 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +35 -0
- data/spec/vandalism_detection/instances_spec.rb +156 -0
- data/spec/vandalism_detection/page_parser_spec.rb +184 -0
- data/spec/vandalism_detection/page_spec.rb +135 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +115 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +231 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +264 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +72 -0
- data/spec/weka/classifiers/functions/lib_svm_spec.rb +38 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +76 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +40 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +40 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +6 -0
- data/wikipedia-vandalism_detection.gemspec +30 -0
- metadata +512 -0
@@ -0,0 +1,27 @@
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    require 'java'
    require 'java/diffutils-1.3.0.jar'

    java_import 'difflib.DiffUtils'

    # Word-level diff between two versions of a revision text, backed by
    # the Java difflib library (JRuby bridge).
    class Diff

      # original, current: the two text versions to compare. Both are
      # re-encoded to UTF-8, silently dropping invalid/undefined bytes,
      # before being split into words for the diff.
      def initialize(original, current)
        sanitize = lambda do |text|
          text.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
        end

        @original = sanitize.call(original)
        @current  = sanitize.call(current)
        @patch    = DiffUtils.diff(@original.split, @current.split)
      end

      # Returns the words present in the current text but not in the original.
      def inserted_words
        @patch.deltas.flat_map { |delta| delta.revised.lines }
      end

      # Returns the words present in the original text but not in the current.
      def removed_words
        @patch.deltas.flat_map { |delta| delta.original.lines }
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'wikipedia/vandalism_detection/diff'
|
2
|
+
require 'wikipedia/vandalism_detection/text'
|
3
|
+
require 'wikipedia/vandalism_detection/page'
|
4
|
+
|
5
|
+
module Wikipedia
  module VandalismDetection
    # Represents a single edit between two sequent revisions of a page.
    class Edit

      attr_reader :old_revision, :new_revision
      attr_accessor :page

      # old_revision and new_revision must be sequent, i.e. the new
      # revision's parent id equals the old revision's id.
      #
      # attributes may hold an optional :page; defaults to a fresh Page.
      #
      # @raise [ArgumentError] if the revisions are not sequent.
      def initialize(old_revision, new_revision, attributes = {})
        unless sequent?(old_revision, new_revision)
          # Build the diagnostic lazily and on one line; the previous
          # multi-line string literal embedded a raw newline plus source
          # indentation into the error message.
          message = "old revision: #{old_revision.id} | parent: #{old_revision.parent_id}, " \
                    "new revision: #{new_revision.id} | parent: #{new_revision.parent_id}"
          raise ArgumentError, "Revisions are not sequent: #{message}."
        end

        @old_revision = old_revision
        @new_revision = new_revision
        @page = attributes[:page] || Page.new
      end

      # Serializes the given attributes of both revisions into one
      # tab-separated line: "<old attrs joined by ','>\t<new attrs joined by ','>".
      # Attributes a revision does not respond to are skipped.
      def serialize(*attributes)
        old_revision_string = revision_parts(@old_revision, attributes).join(',')
        new_revision_string = revision_parts(@new_revision, attributes).join(',')

        "#{old_revision_string}\t#{new_revision_string}"
      end

      # Returns an array of the words inserted in the new revision compared with the old one.
      def inserted_words
        @inserted_words ||= diff.inserted_words
      end

      # Returns a Text of the words inserted in the new revision compared with the old one.
      def inserted_text
        @inserted_text ||= Text.new(inserted_words.join(' '))
      end

      # Returns an array of the words removed in the new revision compared with the old one.
      def removed_words
        @removed_words ||= diff.removed_words
      end

      # Returns a Text of the words removed in the new revision compared with the old one.
      def removed_text
        @removed_text ||= Text.new(removed_words.join(' '))
      end

      protected

      # Returns whether the given revisions are sequent, i.e. the old revision's
      # id is the new revision's parent id.
      def sequent?(old_revision, new_revision)
        new_revision.parent_id == old_revision.id
      end

      private

      # Memoized word-level diff between the two revision texts.
      def diff
        @diff ||= Diff.new(@old_revision.text, @new_revision.text)
      end

      # Collects the values of +attributes+ that +revision+ responds to,
      # preserving the given attribute order.
      def revision_parts(revision, attributes)
        attributes.select { |attr| revision.respond_to?(attr) }
                  .map { |attr| revision.public_send(attr) }
      end
    end
  end
end
|
@@ -0,0 +1,606 @@
|
|
1
|
+
require 'wikipedia/vandalism_detection/configuration'
|
2
|
+
require 'wikipedia/vandalism_detection/exceptions'
|
3
|
+
require 'wikipedia/vandalism_detection/training_dataset'
|
4
|
+
require 'wikipedia/vandalism_detection/test_dataset'
|
5
|
+
require 'wikipedia/vandalism_detection/classifier'
|
6
|
+
require 'wikipedia/vandalism_detection/instances'
|
7
|
+
require 'ruby-band'
|
8
|
+
require 'fileutils'
|
9
|
+
require 'csv'
|
10
|
+
|
11
|
+
module Wikipedia
|
12
|
+
module VandalismDetection
|
13
|
+
|
14
|
+
# This class provides methods for the evaluation of a Wikipedia::VandalismDetection::Classifier
|
15
|
+
# using the weka framwork.
|
16
|
+
#
|
17
|
+
# @example
|
18
|
+
# classifier = Wikipedia::VandalismDetection::Classifier.new
|
19
|
+
# evaluator = Wikipedia::VandalsimDetection::Evaluator(classifier)
|
20
|
+
#
|
21
|
+
# evaluation = evaluator.cross_validate
|
22
|
+
# evaluation = evaluator.cross_validate(equally_distributed: true)
|
23
|
+
#
|
24
|
+
# puts evaluation[:precision]
|
25
|
+
# puts evaluation[:recall]
|
26
|
+
# puts evaluation[:area_under_prc]
|
27
|
+
class Evaluator
|
28
|
+
|
29
|
+
DEFAULT_SAMPLE_COUNT = 200.freeze
|
30
|
+
|
31
|
+
# Builds an evaluator around the given classifier and caches the current
# configuration plus the underlying Weka classifier instance.
#
# @raise [ArgumentError] unless a Wikipedia::VandalismDetection::Classifier is given.
def initialize(classifier)
  unless classifier.is_a?(Wikipedia::VandalismDetection::Classifier)
    raise ArgumentError, 'Classifier param has to be a Wikipedia::VandalismDetection::Classifier instance'
  end

  @config = Wikipedia::VandalismDetection.configuration
  @classifier = classifier
  @classifier_instance = classifier.classifier_instance
end
|
39
|
+
|
40
|
+
# Cross validates the classifier.
# Fold is taken from the configuration (falls back to the default, usually 10).
#
# @example
#   classifier = Wikipedia::VandalismDetection::Classifier.new
#   evaluation = classifier.cross_validate
#   evaluation = classifier.cross_validate(equally_distributed: true)
#
def cross_validate(options = {})
  fold_defaults = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['classifier']['cross-validation-fold']
  fold = @config.cross_validation_fold || fold_defaults

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end
|
60
|
+
|
61
|
+
# Returns a Hash with the evaluation curve data Arrays for precision and
# recall, plus the area under the precision-recall curve.
#
# @example
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#
#   curve_data = evaluator.curve_data
#   curve_data[:precision]       # => [0.76, ..., 0.91]
#   curve_data[:recall]          # => [0.87, ..., 0.89]
#   curve_data[:area_under_prc]  # => 0.83
def curve_data(options = {})
  evaluations = cross_validate(options)

  # Equally-distributed cross validation yields an Array of evaluations;
  # use the first one in that case.
  evaluation_data = evaluations.is_a?(Array) ? evaluations.first : evaluations

  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new
  instances = threshold_curve.curve(evaluation_data.predictions, Instances::VANDALISM_CLASS_INDEX)

  {
    precision: instances.return_attr_data('Precision'),
    recall: instances.return_attr_data('Recall'),
    area_under_prc: evaluation_data.area_under_prc(Instances::VANDALISM_CLASS_INDEX)
  }
end
|
93
|
+
|
94
|
+
# Evaluates the classification of the configured test corpus against the
# given ground truth. Creates the classification file automatically unless
# it already exists.
#
# The number of threshold samples can be set via the sample_count: option
# (default: DEFAULT_SAMPLE_COUNT).
#
# Returns a Hash with:
#   :recalls         - recall values
#   :precisions      - precision values
#   :fp_rates        - false positive rate values
#   :auprc           - area under precision recall curve
#   :auroc           - area under receiver operator curve
#   :total_recall    - overall classifier recall value
#   :total_precision - overall classifier precision value
#
# @example
#   evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
#   evaluator.evaluate_testcorpus_classification
#   evaluator.evaluate_testcorpus_classification(sample_count: 50)
#
# @raise [GroundTruthFileNotConfiguredError] if no ground truth file is configured.
# @raise [GroundTruthFileNotFoundError] if the configured file does not exist.
def evaluate_testcorpus_classification(options = {})
  ground_truth_file_path = @config.test_corpus_ground_truth_file

  unless ground_truth_file_path
    raise GroundTruthFileNotConfiguredError, 'Ground truth file path has to be set for test set evaluation!'
  end

  unless File.exist?(ground_truth_file_path)
    raise GroundTruthFileNotFoundError, 'Configured ground truth file is not available.'
  end

  classification_file = @config.test_output_classification_file

  ground_truth = ground_truth_hash(ground_truth_file_path)
  create_testcorpus_classification_file!(classification_file, ground_truth)
  classification = classification_hash(classification_file)

  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, sample_count)
  maxima = maximum_precision_recall(curves[:precisions], curves[:recalls])

  curves[:total_recall] = maxima[:recall]
  curves[:total_precision] = maxima[:precision]

  curves
end
|
141
|
+
|
142
|
+
# Returns the performance curve points (recall, precision, fp-rate) and the
# computed areas under the PR and ROC curves.
def test_performance_curves(ground_truth, classification, sample_count)
  # Drop the first threshold so the degenerate [0, 1] point is not part
  # of the curve.
  thresholds = (0.0...1.0).step(1.0 / sample_count.to_f).to_a.drop(1)

  precisions = []
  recalls = []
  fp_rates = []

  thresholds.each do |threshold|
    counts = predictive_values(ground_truth, classification, threshold)
    params = performance_parameters(counts[:tp], counts[:fp], counts[:tn], counts[:fn])

    precisions << params[:precision]
    recalls << params[:recall]
    fp_rates << params[:fp_rate]
  end

  # The ROC curve plots the true-positive rate (= recall) against the
  # false-positive rate.
  pr_sorted = sort_curve_values(recalls, precisions, { x: 0.0 }, { y: 0.0 })
  roc_sorted = sort_curve_values(fp_rates, recalls, { y: 0.0 }, { x: 1.0 })

  {
    precisions: pr_sorted[:y],
    recalls: pr_sorted[:x],
    fp_rates: roc_sorted[:x],
    tp_rates: roc_sorted[:y],
    pr_auc: area_under_curve(pr_sorted[:x], pr_sorted[:y]),
    roc_auc: area_under_curve(roc_sorted[:x], roc_sorted[:y])
  }
end
|
176
|
+
|
177
|
+
# Returns the predictive values hash (TP, FP, TN, FN) for a certain threshold.
def predictive_values(ground_truth, classification, threshold)
  tp = 0 # vandalism which is classified as vandalism
  fp = 0 # regular that is classified as vandalism
  tn = 0 # regular that is classified as regular
  fn = 0 # vandalism that is classified as regular

  ground_truth.each do |_, values|
    target_class = values[:class]
    key = :"#{values[:old_revision_id]}-#{values[:new_revision_id]}"

    # Skip annotated samples that are missing from the classification.
    next unless classification.has_key?(key)

    confidence = classification[key][:confidence]

    tp += 1 if Evaluator.true_positive?(target_class, confidence, threshold)
    fn += 1 if Evaluator.false_negative?(target_class, confidence, threshold)
    fp += 1 if Evaluator.false_positive?(target_class, confidence, threshold)
    tn += 1 if Evaluator.true_negative?(target_class, confidence, threshold)
  end

  { tp: tp, fp: fp, tn: tn, fn: fn }
end
|
201
|
+
|
202
|
+
# Returns whether the given confidence represents a true positive (TP):
# an actual vandalism sample whose confidence exceeds the threshold.
def self.true_positive?(target_class, confidence, threshold)
  confidence.to_f > threshold.to_f && target_class == Instances::VANDALISM_SHORT
end
|
207
|
+
|
208
|
+
# Returns whether the given confidence represents a true negative (TN):
# an actual regular sample whose confidence stays below the threshold.
def self.true_negative?(target_class, confidence, threshold)
  confidence.to_f < threshold.to_f && target_class == Instances::REGULAR_SHORT
end
|
213
|
+
|
214
|
+
# Returns whether the given confidence represents a false positive (FP):
# an actual regular sample whose confidence reaches the threshold.
def self.false_positive?(target_class, confidence, threshold)
  confidence.to_f >= threshold.to_f && target_class == Instances::REGULAR_SHORT
end
|
219
|
+
|
220
|
+
# Returns whether the given confidence represents a false negative (FN):
# an actual vandalism sample whose confidence does not exceed the threshold.
def self.false_negative?(target_class, confidence, threshold)
  confidence.to_f <= threshold.to_f && target_class == Instances::VANDALISM_SHORT
end
|
225
|
+
|
226
|
+
# Computes precision, recall and false-positive rate from the raw confusion
# matrix counts (TP, FP, TN, FN). A degenerate denominator (no relevant
# samples at all) yields 1.0 for that measure.
def performance_parameters(tp, fp, tn, fn)
  {
    precision: (tp + fp).zero? ? 1.0 : tp.to_f / (tp.to_f + fp.to_f),
    recall:    (tp + fn).zero? ? 1.0 : tp.to_f / (tp.to_f + fn.to_f),
    fp_rate:   (fp + tn).zero? ? 1.0 : fp.to_f / (fp.to_f + tn.to_f)
  }
end
|
238
|
+
|
239
|
+
# Returns the area under the curve given by the point arrays, computed with
# the trapezoidal rule: A = 1/2 * (b1 + b2) * h for each adjacent pair.
#
# x_values and y_values must be numeric arrays of the same length.
# Fewer than two points yield 0.0. The absolute value is returned, so the
# x values may be given in descending order as well.
#
# @raise [ArgumentError] if the arrays differ in length.
def area_under_curve(x_values, y_values)
  raise ArgumentError, 'x and y values must have the same length!' unless x_values.count == y_values.count

  # Idiomatic pairwise iteration replaces the former manual index loop.
  points = x_values.zip(y_values)

  sum = points.each_cons(2).reduce(0.0) do |area, ((x1, y1), (x2, y2))|
    area + 0.5 * (y1 + y2) * (x2 - x1)
  end

  sum.abs
end
|
260
|
+
|
261
|
+
# Returns given value array sorted by first array (x_values).
# Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> }
# start_values is added in front of arrays if set, e.g. { x: 0.0, y: 1.0 }
# end_values is added to end of arrays if set, e.g. { x: 1.0, y: 1.0 }
#
# @example
#   evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
#   #=> Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  # pair up the points, sort by ascending x (descending y for equal x)
  # and drop exact duplicate points
  merge_sorted = x_values.each_with_index.map { |x, index| [x, y_values[index]] }
  merge_sorted = merge_sorted.sort_by { |values| [values[0], -values[1]] }.uniq

  # transpose once (the original transposed the array twice, once per axis)
  x, y = merge_sorted.transpose

  start_values_set = start_values && (start_values.key?(:x) || start_values.key?(:y))
  end_values_set = end_values && (end_values.key?(:x) || end_values.key?(:y))

  # prepend the start point unless the curve already begins with it;
  # a missing coordinate is filled with the current first value
  if start_values_set && !(x.first == start_values[:x] && y.first == start_values[:y])
    x.unshift(start_values[:x] || x.first)
    y.unshift(start_values[:y] || y.first)
  end

  # append the end point unless the curve already ends with it
  if end_values_set && !(x.last == end_values[:x] && y.last == end_values[:y])
    x.push(end_values[:x] || x.last)
    y.push(end_values[:y] || y.last)
  end

  { x: x, y: y }
end
|
295
|
+
|
296
|
+
# Returns the maximum precision recall pair, i.e. the pair whose product
# (precision * recall) is maximal. Pairs containing NaN values are ignored.
#
# Returns a Hash { precision: <Float>, recall: <Float> }.
def maximum_precision_recall(precisions, recalls)
  areas = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # remove entries holding NaN values
  areas.reject! { |pair| pair.any? { |value| value.to_f.nan? } }

  # `max` already yields the lexicographically largest [area, index] pair;
  # the original sorted the whole array first, which is redundant work
  max_index = areas.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end
|
307
|
+
|
308
|
+
# Creates the test corpus text file by classifying the configured test samples.
# All sub steps (as creating the test arff file, etc.) are run automatically if needed.
#
# The written file is space separated: a header line
# "OLDREVID NEWREVID C CONF <FEATURE NAMES...>" followed by one line per
# test instance.
#
# Raises an ArgumentError if ground_truth_data is nil.
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  raise(ArgumentError, "Ground truth data hash is not allowed to be nil!") if ground_truth_data.nil?

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  # NOTE(review): Dir.exists? is deprecated in newer Rubies — Dir.exist? is
  # the supported spelling; left unchanged here.
  FileUtils.mkdir_p(dir_name) unless Dir.exists?(dir_name)
  # NOTE(review): the file is not closed if an exception is raised below;
  # a File.open block (or ensure) would be safer.
  file = File.open(file_path, 'w')

  # the last two attributes are the revision ids — drop them from the header
  feature_names = dataset.enumerate_attributes.to_a.map { |attr| attr.name.upcase }[0...-2]
  header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

  file.puts header

  dataset.to_a2d.each do |instance|
    # instance layout: [features..., old revision id, new revision id, class]
    features = instance[0...-3]
    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i
    ground_truth_class_name = Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

    classification = @classifier.classify(features, return_all_params: true)
    class_value = Features::MISSING_VALUE

    if @config.classifier_type.match(/Functions::LibSVM/) && @config.classifier_options.match(/-s 2/i)
      # LibSVM with one class has only one class during training
      # Vandalism will get class index 0 while classifying
      # Regular will get missing (or Instances::NOT_KNOWN_INDEX in Wikipedia::VandalismDetection::Classifier)

      if classification[:class_index] == 0
        class_value = 1.0
      elsif classification[:class_index] == Instances::NOT_KNOWN_INDEX
        class_value = 0.0
      end
    else
      if classification[:class_index] == Instances::VANDALISM_CLASS_INDEX
        class_value = 1.0
      elsif classification[:class_index] == Instances::REGULAR_CLASS_INDEX
        class_value = 0.0
      end
    end

    # fall back to the mapped class value when the classifier yields no confidence
    confidence = classification[:confidence] || class_value

    # presumably: a one-class classifier trained on the vandalism class reports
    # confidences for the opposite class, so they are inverted — TODO confirm.
    # NOTE(review): classification_data below uses `!` (single negation) for
    # the same regex match; one of the two is probably inverted — verify.
    must_be_inverted = @config.use_occ? && !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
    confidence_value = must_be_inverted ? (1.0 - confidence) : confidence
    # serialize NaN feature values as the configured missing-value marker
    features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

    file.puts [old_revision_id, new_revision_id, ground_truth_class_name, confidence_value, *features].join(' ')
  end

  file.close
end
|
362
|
+
|
363
|
+
# Returns a hash comprising each feature's predictive values analysis for different thresholds.
# The Hash structure is the following one:
# {
#   feature_name_1:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
#   ...,
#   feature_name_n:
#     {
#       0.0 => {fp: , fn: , tp: , tn: },
#       ... => {fp: , fn: , tp: , tn: },
#       1.0 => {fp: , fn: , tp: , tn: }
#     },
# }
#
# @param options [Hash] supports :sample_count — the number of thresholds
#   spread over [0.0, 1.0] (defaults to DEFAULT_SAMPLE_COUNT)
def feature_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  # sample_count evenly spaced thresholds between 0.0 and 1.0 (inclusive)
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file
  training_dataset = TrainingDataset.instances
  test_dataset = TestDataset.build!

  analysis = {}

  @config.features.each_with_index do |feature_name, index |
    puts "analyzing feature... '#{feature_name}'"

    # train a classifier on this single feature only (plus class attribute)
    dataset = filter_single_attribute(training_dataset, index)
    print ' | train classifier with feature data...'
    classifier = Classifier.new(dataset)
    print "done \n"

    classification = classification_data(classifier, test_dataset)
    ground_truth = ground_truth_hash(ground_truth_file_path)

    values = {}

    # one predictive-values hash per threshold
    thresholds.each do |threshold|
      values[threshold] = predictive_values(ground_truth, classification, threshold)
    end

    analysis[feature_name] = values
  end

  analysis
end
|
412
|
+
|
413
|
+
# Returns a hash comprising the classifier's predictive values for using all
# configured features for different thresholds.
#
# The result maps each threshold in [0.0, 1.0] to the predictive values
# computed at that threshold.
#
# @param options [Hash] supports :sample_count — the number of thresholds
#   spread over [0.0, 1.0] (defaults to DEFAULT_SAMPLE_COUNT)
def full_analysis(options = {})
  samples = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  step_width = 1.0 / (samples - 1)
  thresholds = (0.0..1.0).step(step_width).to_a

  ground_truth_file = @config.test_corpus_ground_truth_file

  puts 'train classifier...'
  classifier = Classifier.new

  corpus_dataset = TestDataset.build!

  puts 'computing classification...'
  classification = classification_data(classifier, corpus_dataset)
  ground_truth = ground_truth_hash(ground_truth_file)

  analysis = thresholds.each_with_object({}) do |threshold, result|
    result[threshold] = predictive_values(ground_truth, classification, threshold)
  end

  print "done \n"
  analysis
end
|
439
|
+
|
440
|
+
private
|
441
|
+
|
442
|
+
# Returns a dataset only holding the attribute at the given index.
# Weka Unsupervised Attribute Remove filter is used.
#
# @param dataset the source Weka dataset
# @param attribute_index [Integer] zero-based index of the attribute to keep
def filter_single_attribute(dataset, attribute_index)
  filter = Weka::Filters::Unsupervised::Attribute::Remove.new

  filter.set do
    data dataset
    # -V inverts the selection: keep only the given attribute and the class
    # attribute (Weka attribute ranges are 1-based, hence the +1)
    filter_options "-V -R #{attribute_index + 1},#{dataset.class_index + 1}"
  end

  filtered = filter.use
  # the class attribute ends up as the last remaining column
  filtered.class_index = filtered.n_col - 1
  filtered
end
|
456
|
+
|
457
|
+
# Returns an array of classification confidences of the test corpus' classification with the given classifier.
# (More precisely: a Hash keyed by :"<old_revision_id>-<new_revision_id>",
# each value holding the revision ids, the short class name and the
# classification confidence.)
def classification_data(classifier, test_dataset)
  classification = {}

  test_dataset.to_a2d.each do |instance|
    # instance layout: [features..., old revision id, new revision id, class]
    features = instance[0...-3]

    old_revision_id = instance[-3].to_i
    new_revision_id = instance[-2].to_i

    params = classifier.classify(features, return_all_params: true)
    class_short_name = Instances::CLASSES_SHORT[params[:class_index]]

    # NOTE(review): this uses a single negation (`!`) of the regex match while
    # create_testcorpus_classification_file! uses a double negation (`!!`) of
    # the same expression — the two inversion conditions are opposites, so one
    # of them is probably a bug; verify against the one-class-classifier setup.
    must_be_inverted = @config.use_occ? && !(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
    confidence = must_be_inverted ? (1.0 - params[:confidence]) : params[:confidence]

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short_name,
      confidence: confidence
    }
  end

  classification
end
|
483
|
+
|
484
|
+
# Returns a hash for classification data from given classification file.
#
# The file is expected to be space separated with a header line, each data
# line starting with: old revision id, new revision id, short class name and
# confidence. The result is keyed by :"<old_revision_id>-<new_revision_id>".
def classification_hash(classification_file)
  samples = File.read(classification_file).lines.to_a
  samples.shift # remove header line

  samples.each_with_object({}) do |line, classification|
    old_id, new_id, class_short, confidence = line.split(' ')

    old_revision_id = old_id.to_i
    new_revision_id = new_id.to_i

    classification[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short,
      confidence: confidence.to_f
    }
  end
end
|
510
|
+
|
511
|
+
# Returns a hash for classification data from given ground truth file.
#
# Unlike classification_hash, the ground truth file has no header line.
# Each line starts with: old revision id, new revision id, short class name.
# The result is keyed by :"<old_revision_id>-<new_revision_id>".
def ground_truth_hash(ground_truth_file)
  samples = File.read(ground_truth_file).lines.to_a

  samples.each_with_object({}) do |line, ground_truth|
    old_id, new_id, class_short = line.split(' ')

    old_revision_id = old_id.to_i
    new_revision_id = new_id.to_i

    ground_truth[:"#{old_revision_id}-#{new_revision_id}"] = {
      old_revision_id: old_revision_id,
      new_revision_id: new_revision_id,
      class: class_short
    }
  end
end
|
534
|
+
|
535
|
+
# Cross validates classifier over full dataset with <fold>-fold cross validation.
# Wraps any failure in a RuntimeError describing the validation step.
def cross_validate_all_instances(fold)
  @classifier_instance.cross_validate(fold)
rescue => e
  raise "Error while cross validation: #{e}"
end
|
543
|
+
|
544
|
+
# Cross validates classifier over equally distributed dataset with <fold>-fold cross validation.
# Runs the validation 10 times, each on a freshly sampled balanced dataset,
# and appends intermediate averaged evaluation data to
# <output_base_directory>/cross_validation_eq_distr.txt.
#
# Returns the array of evaluation objects collected over all runs.
def cross_validate_equally_distributed(fold)
  dirname = @config.output_base_directory
  # NOTE(review): Dir.exists? is deprecated (use Dir.exist?); also mkdir is
  # non-recursive and fails when parent directories are missing — mkdir_p is
  # used elsewhere in this file. Left unchanged here.
  FileUtils.mkdir(dirname) unless Dir.exists?(dirname)

  file_name = 'cross_validation_eq_distr.txt'
  file_path = File.join(dirname, file_name)

  puts "Writing to #{file_path}..."
  # append mode: results of successive runs accumulate in the same file
  result_file = File.open(file_path, 'a')

  begin
    time = Time.now.strftime("%Y-%m-%d %H:%M")
    type = @config.classifier_type
    options = @config.classifier_options || "default"
    result_file.puts "\nCROSS VALIDATION - #{fold} fold (Classifier: #{type}, options: #{options} ) | #{time}"
    result_file.puts "Features: \n\t#{@config.features.join("\n\t")}\n\n"

    evaluations = []

    times = 10

    # run n times validation
    (1..times).each do |i|
      # a fresh, equally distributed sample for every run
      uniform_dataset = TrainingDataset.balanced_instances

      print "\rcross validate dataset (equally distributed) ... #{i}/#{times} | instances: #{uniform_dataset.n_rows}"
      @classifier_instance.set_data(uniform_dataset)
      evaluations << @classifier_instance.cross_validate(fold)

      # write intermediate averaged metrics every (times / 10) runs
      print_evaluation_data(evaluations, result_file, i) if (i % (times / 10)) == 0
    end

    #evaluation_data_of(evaluations)
    evaluations
  rescue => e
    raise "Error while cross validation for equally distributed instances: #{e}"
  ensure
    # always close the result file, even when the validation raised
    result_file.close
    puts "\nThe evaluation results has been saved to #{file_path}"
  end
end
|
586
|
+
|
587
|
+
# Returns the evaluation data average value hash of the given evaluations.
# Averages the vandalism-class precision, recall and area under the PR curve
# over all given evaluation objects.
#
# Returns a Hash { precision:, recall:, area_under_prc: }.
def evaluation_data_of(evaluations)
  class_index = Instances::VANDALISM_CLASS_INDEX
  total_count = evaluations.count.to_f

  average = lambda do |metric|
    evaluations.reduce(0.0) { |sum, sample| sum + sample.send(metric, class_index) } / total_count
  end

  {
    precision: average.call(:precision),
    recall: average.call(:recall),
    area_under_prc: average.call(:area_under_prc)
  }
end
|
598
|
+
|
599
|
+
# Writes the averaged evaluation metrics of the given evaluations to the
# given file handle, prefixed with the given run index.
def print_evaluation_data(evaluations, file, index)
  data = evaluation_data_of(evaluations)
  line = "#{index}\tprecision: #{data[:precision]} | recall: #{data[:recall]} | Area under PRC: #{data[:area_under_prc]}"
  file.puts line
end
|
604
|
+
end
|
605
|
+
end
|
606
|
+
end
|