wikipedia-vandalism_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +4 -0
- data/README.md +265 -0
- data/Rakefile +12 -0
- data/lib/java/LibSVM.jar +0 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/libsvm.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/functions/lib_svm.rb +15 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +25 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +17 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +18 -0
- data/lib/weka/filters/supervised/instance/smote.rb +22 -0
- data/lib/wikipedia.rb +51 -0
- data/lib/wikipedia/vandalism_detection.rb +30 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +18 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +69 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +186 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +321 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +27 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +75 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +606 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +40 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +89 -0
- data/lib/wikipedia/vandalism_detection/features.rb +67 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +54 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +65 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/user_reputation.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +22 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +130 -0
- data/lib/wikipedia/vandalism_detection/page.rb +88 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +52 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +69 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +43 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +367 -0
- data/lib/wikipedia/vandalism_detection/text.rb +18 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +303 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +21 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +22 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +15 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +97 -0
- data/spec/factories/edit.rb +20 -0
- data/spec/factories/page.rb +13 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/config.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/macros/file_reading.rb +7 -0
- data/spec/support/macros/test_configuration.rb +71 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +36 -0
- data/spec/vandalism_detection/classifier_spec.rb +317 -0
- data/spec/vandalism_detection/configuration_spec.rb +517 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +137 -0
- data/spec/vandalism_detection/evaluator_spec.rb +671 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +128 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +61 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +23 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/base_spec.rb +49 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +38 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +37 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +27 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +42 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +33 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +33 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +49 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +51 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +26 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +41 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +35 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +35 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +26 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +35 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +44 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +28 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +60 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +35 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +57 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +38 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +50 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +22 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +37 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/user_reputation_spec.rb +52 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +22 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +35 -0
- data/spec/vandalism_detection/instances_spec.rb +156 -0
- data/spec/vandalism_detection/page_parser_spec.rb +184 -0
- data/spec/vandalism_detection/page_spec.rb +135 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +115 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +231 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +264 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +72 -0
- data/spec/weka/classifiers/functions/lib_svm_spec.rb +38 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +76 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +40 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +40 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +6 -0
- data/wikipedia-vandalism_detection.gemspec +30 -0
- metadata +512 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wikipedia::VandalismDetection::Diff do
|
4
|
+
|
5
|
+
it "can deal with invalid byte sequences" do
|
6
|
+
text = "text \255".force_encoding('UTF-8')
|
7
|
+
expect { Wikipedia::VandalismDetection::Diff.new("#{text} a", "#{text} b") }.not_to raise_error
|
8
|
+
end
|
9
|
+
|
10
|
+
before do
|
11
|
+
@old_text = Wikipedia::VandalismDetection::Text.new "hello\nworld\nmy name is Luke\n"
|
12
|
+
@new_text = Wikipedia::VandalismDetection::Text.new "world\nhello\nmy name is Mr. Skywalker\n"
|
13
|
+
@diff = Wikipedia::VandalismDetection::Diff.new(@old_text, @new_text)
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "getting the inserted and removed words" do
|
17
|
+
|
18
|
+
it "can return the added words as array" do
|
19
|
+
inserted_words = @diff.inserted_words
|
20
|
+
|
21
|
+
expect(inserted_words).to be_an Array
|
22
|
+
expect(inserted_words.count).to eq 3
|
23
|
+
end
|
24
|
+
|
25
|
+
it "can return the removed words as array" do
|
26
|
+
removed_words = @diff.removed_words
|
27
|
+
|
28
|
+
expect(removed_words).to be_an Array
|
29
|
+
expect(removed_words.count).to eq 2
|
30
|
+
end
|
31
|
+
|
32
|
+
it "returns the right inserted words" do
|
33
|
+
expect(@diff.inserted_words).to eq ['hello', 'Mr.', 'Skywalker']
|
34
|
+
end
|
35
|
+
|
36
|
+
it "returns the right removed words" do
|
37
|
+
expect(@diff.removed_words).to eq ['hello', 'Luke']
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wikipedia::VandalismDetection::Edit do
|
4
|
+
|
5
|
+
before do
|
6
|
+
@old_revision = build :old_revision
|
7
|
+
@new_revision = build :new_revision
|
8
|
+
@page_id = '1234'
|
9
|
+
|
10
|
+
@edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision)
|
11
|
+
end
|
12
|
+
|
13
|
+
it "has an old revision" do
|
14
|
+
expect(@edit.old_revision).to eq @old_revision
|
15
|
+
end
|
16
|
+
|
17
|
+
it "has a new revision" do
|
18
|
+
expect(@edit.new_revision).to eq @new_revision
|
19
|
+
end
|
20
|
+
|
21
|
+
it "can be built with its parent page referenced" do
|
22
|
+
page = build(:page, id: '1234', title: 'Page Title')
|
23
|
+
edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision, page: page)
|
24
|
+
expect(edit.page).to eq page
|
25
|
+
end
|
26
|
+
|
27
|
+
it "can be built with a page to get the id" do
|
28
|
+
page_id = '1234'
|
29
|
+
page = Wikipedia::VandalismDetection::Page.new
|
30
|
+
page.id = page_id
|
31
|
+
|
32
|
+
edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision, page: page)
|
33
|
+
expect(edit.page.id).to eq page_id
|
34
|
+
end
|
35
|
+
|
36
|
+
it "can be built with a page to get the title" do
|
37
|
+
page = Wikipedia::VandalismDetection::Page.new
|
38
|
+
page_title = 'Article'
|
39
|
+
page.title = page_title
|
40
|
+
|
41
|
+
edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision, page: page)
|
42
|
+
expect(edit.page.title).to eq page_title
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "exception handling" do
|
46
|
+
it "does not raise an error if page parameters are called" do
|
47
|
+
edit = Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision)
|
48
|
+
expect { edit.page.id }.not_to raise_error
|
49
|
+
end
|
50
|
+
|
51
|
+
it "raises no error if revisions are sequent" do
|
52
|
+
expect { Wikipedia::VandalismDetection::Edit.new(@old_revision, @new_revision) }.not_to raise_error
|
53
|
+
end
|
54
|
+
|
55
|
+
it "raises an error if revisions are not sequent" do
|
56
|
+
expect { Wikipedia::VandalismDetection::Edit.new(@new_revision, @old_revision) }.to raise_exception ArgumentError
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#serialize" do
|
61
|
+
it "serializes the given parameters into a string" do
|
62
|
+
expect(@edit.serialize(:id, :text)).to eq "1,text 1\t2,text 2"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
describe "#inserted_words" do
|
67
|
+
it "returns the inserted words as array" do
|
68
|
+
old_revision = build(:old_revision, text: "")
|
69
|
+
new_revision = build(:new_revision, text: "inserted words")
|
70
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
71
|
+
|
72
|
+
expect(edit.inserted_words).to eq ['inserted', 'words']
|
73
|
+
end
|
74
|
+
|
75
|
+
it "returns the uncleaned text inserted words as array" do
|
76
|
+
old_revision = build(:old_revision, text: "")
|
77
|
+
new_revision = build(:new_revision, text: "[[inserted words]]")
|
78
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
79
|
+
|
80
|
+
expect(edit.inserted_words).to eq ['[[inserted', 'words]]']
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
describe "#inserted_text" do
|
85
|
+
it "returns the inserted text as Wikipedia::VandalismDetection::Text" do
|
86
|
+
old_revision = build(:old_revision, text: "")
|
87
|
+
new_revision = build(:new_revision, text: "inserted words")
|
88
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
89
|
+
|
90
|
+
expect(edit.inserted_text).to eq Wikipedia::VandalismDetection::Text.new('inserted words')
|
91
|
+
end
|
92
|
+
|
93
|
+
it "returns the uncleaned text inserted text as Wikipedia::VandalismDetection::Text" do
|
94
|
+
old_revision = build(:old_revision, text: "")
|
95
|
+
new_revision = build(:new_revision, text: "[[inserted words]]")
|
96
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
97
|
+
|
98
|
+
expect(edit.inserted_text).to eq Wikipedia::VandalismDetection::Text.new('[[inserted words]]')
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
describe "#removed_words" do
|
103
|
+
it "returns the removed words as array" do
|
104
|
+
old_revision = build(:old_revision, text: "removed words")
|
105
|
+
new_revision = build(:new_revision, text: "")
|
106
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
107
|
+
|
108
|
+
expect(edit.removed_words).to eq ['removed', 'words']
|
109
|
+
end
|
110
|
+
|
111
|
+
it "returns the uncleaned text removed words as array" do
|
112
|
+
old_revision = build(:old_revision, text: "[[removed words]]")
|
113
|
+
new_revision = build(:new_revision, text: "")
|
114
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
115
|
+
|
116
|
+
expect(edit.removed_words).to eq ['[[removed', 'words]]']
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
describe "#removed_text" do
|
121
|
+
it "returns the removed text as Wikipedia::VandalismDetection::Text" do
|
122
|
+
old_revision = build(:old_revision, text: "removed words")
|
123
|
+
new_revision = build(:new_revision, text: "")
|
124
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
125
|
+
|
126
|
+
expect(edit.removed_text).to eq Wikipedia::VandalismDetection::Text.new('removed words')
|
127
|
+
end
|
128
|
+
|
129
|
+
it "returns the uncleaned text removed text as Wikipedia::VandalismDetection::Text" do
|
130
|
+
old_revision = build(:old_revision, text: "[[removed words]]")
|
131
|
+
new_revision = build(:new_revision, text: "")
|
132
|
+
edit = build(:edit, old_revision: old_revision, new_revision: new_revision)
|
133
|
+
|
134
|
+
expect(edit.removed_text).to eq Wikipedia::VandalismDetection::Text.new('[[removed words]]')
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
@@ -0,0 +1,671 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wikipedia::VandalismDetection::Evaluator do
|
4
|
+
|
5
|
+
before do
|
6
|
+
use_test_configuration
|
7
|
+
@config = test_config
|
8
|
+
|
9
|
+
@training_arff_file = @config.training_output_arff_file
|
10
|
+
@test_arff_file = @config.test_output_arff_file
|
11
|
+
@build_dir = @config.output_base_directory
|
12
|
+
@test_classification_file = @config.test_output_classification_file
|
13
|
+
end
|
14
|
+
|
15
|
+
after do
|
16
|
+
# remove training arff file
|
17
|
+
if File.exist?(@training_arff_file)
|
18
|
+
File.delete(@training_arff_file)
|
19
|
+
FileUtils.rm_r(File.dirname @training_arff_file)
|
20
|
+
end
|
21
|
+
|
22
|
+
# remove test arff file
|
23
|
+
if File.exist?(@test_arff_file)
|
24
|
+
File.delete(@test_arff_file)
|
25
|
+
FileUtils.rm_r(File.dirname @test_arff_file)
|
26
|
+
end
|
27
|
+
|
28
|
+
# remove classification.txt
|
29
|
+
if File.exist?(@test_classification_file)
|
30
|
+
File.delete(@test_classification_file)
|
31
|
+
FileUtils.rm_r(File.dirname @test_classification_file)
|
32
|
+
end
|
33
|
+
|
34
|
+
# remove output base directory
|
35
|
+
if Dir.exist?(@build_dir)
|
36
|
+
FileUtils.rm_r(@build_dir)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe "#initialize" do
|
41
|
+
|
42
|
+
it "raises an ArgumentError if classifier attr is not a Wikipedia::VandalismDetection::Classifier" do
|
43
|
+
expect { Wikipedia::VandalismDetection::Evaluator.new("") }.to raise_error ArgumentError
|
44
|
+
end
|
45
|
+
|
46
|
+
it "does not raise an error while appropriate initialization" do
|
47
|
+
classifier = Wikipedia::VandalismDetection::Classifier.new
|
48
|
+
expect { Wikipedia::VandalismDetection::Evaluator.new(classifier) }.not_to raise_error
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
before do
|
53
|
+
classifier = Wikipedia::VandalismDetection::Classifier.new
|
54
|
+
@evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
|
55
|
+
end
|
56
|
+
|
57
|
+
describe "#test_performance_curves" do
|
58
|
+
|
59
|
+
before do
|
60
|
+
@classification = {
|
61
|
+
:"1-2" => {
|
62
|
+
old_revision_id: 1,
|
63
|
+
new_revision_id: 2,
|
64
|
+
class: "R",
|
65
|
+
confidence: 0.0
|
66
|
+
},
|
67
|
+
:"2-3" => {
|
68
|
+
old_revision_id: 2,
|
69
|
+
new_revision_id: 3,
|
70
|
+
class: "R",
|
71
|
+
confidence: 0.3
|
72
|
+
},
|
73
|
+
:"3-4" => {
|
74
|
+
old_revision_id: 3,
|
75
|
+
new_revision_id: 4,
|
76
|
+
class: "V",
|
77
|
+
confidence: 0.8
|
78
|
+
},
|
79
|
+
:"4-5" => {
|
80
|
+
old_revision_id: 4,
|
81
|
+
new_revision_id: 5,
|
82
|
+
class: "V",
|
83
|
+
confidence: 1.0
|
84
|
+
}
|
85
|
+
}
|
86
|
+
|
87
|
+
# ground truth has one sample more to represent fall-out samples while feature calculation
|
88
|
+
# (e.g. redirects are not considered)
|
89
|
+
@ground_truth = {
|
90
|
+
:"0-1" => { # this is a sample that is not used!
|
91
|
+
old_revision_id: 0,
|
92
|
+
new_revision_id: 1,
|
93
|
+
class: "R"
|
94
|
+
},
|
95
|
+
:"1-2" => {
|
96
|
+
old_revision_id: 1,
|
97
|
+
new_revision_id: 2,
|
98
|
+
class: "R"
|
99
|
+
},
|
100
|
+
:"2-3" => {
|
101
|
+
old_revision_id: 2,
|
102
|
+
new_revision_id: 3,
|
103
|
+
class: "V"
|
104
|
+
},
|
105
|
+
:"3-4" => {
|
106
|
+
old_revision_id: 3,
|
107
|
+
new_revision_id: 4,
|
108
|
+
class: "R"
|
109
|
+
},
|
110
|
+
:"4-5" => {
|
111
|
+
old_revision_id: 4,
|
112
|
+
new_revision_id: 5,
|
113
|
+
class: "V"
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
117
|
+
@sample_count = 10
|
118
|
+
|
119
|
+
@curve_data = @evaluator.test_performance_curves(@ground_truth, @classification, @sample_count)
|
120
|
+
end
|
121
|
+
|
122
|
+
it "returns a Hash" do
|
123
|
+
expect(@curve_data).to be_a Hash
|
124
|
+
end
|
125
|
+
|
126
|
+
[:recalls, :precisions,:fp_rates, :tp_rates, :pr_auc, :roc_auc].each do |attribute|
|
127
|
+
it "returns a Hash including #{attribute}" do
|
128
|
+
expect(@curve_data).to have_key(attribute)
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
describe "#predictive_values" do
|
133
|
+
|
134
|
+
before do
|
135
|
+
@threshold = 0.5
|
136
|
+
@predictive_values = @evaluator.predictive_values(@ground_truth, @classification, @threshold)
|
137
|
+
end
|
138
|
+
|
139
|
+
it "returns a Hash" do
|
140
|
+
expect(@predictive_values).to be_a Hash
|
141
|
+
end
|
142
|
+
|
143
|
+
[
|
144
|
+
{ threshold: 0.0, result: {tp: 2, fp: 2, tn: 0, fn: 0} },
|
145
|
+
{ threshold: 0.3, result: {tp: 1, fp: 1, tn: 1, fn: 1} },
|
146
|
+
{ threshold: 0.5, result: {tp: 1, fp: 1, tn: 1, fn: 1} },
|
147
|
+
{ threshold: 0.8, result: {tp: 1, fp: 1, tn: 1, fn: 1} },
|
148
|
+
{ threshold: 0.9, result: {tp: 1, fp: 0, tn: 2, fn: 1} },
|
149
|
+
{ threshold: 1.0, result: {tp: 0, fp: 0, tn: 2, fn: 2} }
|
150
|
+
].each do |values|
|
151
|
+
it "returns the right values for threshold #{values[:threshold]}" do
|
152
|
+
predictive_values = @evaluator.predictive_values(@ground_truth, @classification, values[:threshold])
|
153
|
+
expect(predictive_values).to eq values[:result]
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
describe "#sort_curve_values" do
|
159
|
+
|
160
|
+
before do
|
161
|
+
@x = [0.7, 0.4, 0.8, 0.4, 0.7]
|
162
|
+
@y = [0.6, 0.8, 0.2, 0.6, 0.6]
|
163
|
+
|
164
|
+
@x_sorted = [0.4, 0.4, 0.7, 0.8]
|
165
|
+
@y_sorted = [0.8, 0.6, 0.6, 0.2]
|
166
|
+
end
|
167
|
+
|
168
|
+
it "returns the unique sorted input values" do
|
169
|
+
hash = { x: @x_sorted, y: @y_sorted }
|
170
|
+
sorted = @evaluator.sort_curve_values(@x, @y)
|
171
|
+
|
172
|
+
expect(sorted).to eq hash
|
173
|
+
end
|
174
|
+
|
175
|
+
it "adds start values if given" do
|
176
|
+
start_values = { x: -1.0, y: -2.0 }
|
177
|
+
hash = { x: @x_sorted.unshift(start_values[:x]), y: @y_sorted.unshift(start_values[:y])}
|
178
|
+
sorted = @evaluator.sort_curve_values(@x, @y, start_values)
|
179
|
+
|
180
|
+
expect(sorted).to eq hash
|
181
|
+
end
|
182
|
+
|
183
|
+
it "adds x start value if only one value given" do
|
184
|
+
start_values = { x: -1.0 }
|
185
|
+
hash = { x: @x_sorted.unshift(start_values[:x]), y: @y_sorted.unshift(@y_sorted.first) }
|
186
|
+
sorted = @evaluator.sort_curve_values(@x, @y, start_values)
|
187
|
+
|
188
|
+
expect(sorted).to eq hash
|
189
|
+
end
|
190
|
+
|
191
|
+
it "adds y start value if only one value given" do
|
192
|
+
start_values = { y: -2.0 }
|
193
|
+
hash = { x: @x_sorted.unshift(@x_sorted.first), y: @y_sorted.unshift(start_values[:y]) }
|
194
|
+
sorted = @evaluator.sort_curve_values(@x, @y, start_values)
|
195
|
+
|
196
|
+
expect(sorted).to eq hash
|
197
|
+
end
|
198
|
+
|
199
|
+
it "adds end values if given" do
|
200
|
+
end_values = { x: -1.0, y: -2.0 }
|
201
|
+
hash = { x: @x_sorted.push(end_values[:x]), y: @y_sorted.push(end_values[:y]) }
|
202
|
+
sorted = @evaluator.sort_curve_values(@x, @y, nil, end_values)
|
203
|
+
|
204
|
+
expect(sorted).to eq hash
|
205
|
+
end
|
206
|
+
|
207
|
+
it "adds y end values if only one value is given" do
|
208
|
+
end_values = {y: -2.0 }
|
209
|
+
hash = { x: @x_sorted.push(@x_sorted.last), y: @y_sorted.push(end_values[:y]) }
|
210
|
+
sorted = @evaluator.sort_curve_values(@x, @y, nil, end_values)
|
211
|
+
|
212
|
+
expect(sorted).to eq hash
|
213
|
+
end
|
214
|
+
|
215
|
+
it "adds x end values if only one value is given" do
|
216
|
+
end_values = {x: -1.0 }
|
217
|
+
hash = { x: @x_sorted.push(end_values[:x]), y: @y_sorted.push(@y_sorted.last) }
|
218
|
+
sorted = @evaluator.sort_curve_values(@x, @y, nil, end_values)
|
219
|
+
|
220
|
+
expect(sorted).to eq hash
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
# Specs for Evaluator#area_under_curve, which is called with an array of x
# values followed by an array of y values.
describe "#area_under_curve" do

  before do
    # A precision-recall curve plots recall on the x axis against precision on
    # the y axis; the earlier version passed the precisions for both axes.
    # NOTE(review): assumes @curve_data exposes :recalls alongside :precisions,
    # like the performance hash checked in the
    # #evaluate_testcorpus_classification specs - confirm in the outer setup.
    @pr_auc = @evaluator.area_under_curve(@curve_data[:recalls], @curve_data[:precisions])
    @roc_auc = @evaluator.area_under_curve(@curve_data[:fp_rates], @curve_data[:tp_rates])
  end

  it "returns a numeric value for pr_auc" do
    expect(@pr_auc).to be_a Numeric
  end

  it "returns a numeric value between 0.0 & 1.0 for pr_auc" do
    is_between_zero_and_one = @pr_auc.between?(0.0, 1.0)
    expect(is_between_zero_and_one).to be true
  end

  it "returns a numeric value for roc_auc" do
    expect(@roc_auc).to be_a Numeric
  end

  it "returns a numeric value between 0.0 & 1.0 for roc_auc" do
    is_between_zero_and_one = @roc_auc.between?(0.0, 1.0)
    expect(is_between_zero_and_one).to be true
  end

  # Hand-checked curves: each entry pairs x/y values with the expected area.
  [
    { x: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], y: [1.0, 0.8, 0.6, 0.4, 0.2, 0.0], auc: 0.5 }
  ].each do |data|
    it "returns the right values" do
      x = data[:x]
      y = data[:y]
      auc = data[:auc]

      # The area is a sum of float products, so compare with a tolerance
      # instead of exact equality.
      expect(@evaluator.area_under_curve(x, y)).to be_within(1e-10).of(auc)
    end
  end

end
|
262
|
+
end
|
263
|
+
|
264
|
+
# Specs for Evaluator#create_testcorpus_classification_file!, called with the
# target file path and a ground truth hash.
describe "#create_testcorpus_classification_file!" do

  before do
    @ground_truth = { # see resources file ground_truth.csv
      :"0-1" => { # this is a sample that is not used!
        old_revision_id: 0,
        new_revision_id: 1,
        class: "R"
      },
      :"307084144-326873205" => {
        old_revision_id: 307084144,
        new_revision_id: 326873205,
        class: "R"
      },
      :"326471754-326978767" => {
        old_revision_id: 326471754,
        new_revision_id: 326978767,
        class: "V"
      },
      :"328774035-328774110" => {
        old_revision_id: 328774035,
        new_revision_id: 328774110,
        class: "R"
      }
    }
  end

  it "raises an argument error if ground_truth param is nil" do
    expect { @evaluator.create_testcorpus_classification_file!(@test_classification_file, nil) }.to raise_error ArgumentError
  end

  it "creates a classification file in the base output directory" do
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    expect(File.exist?(@test_classification_file)).to be false
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)
    expect(File.exist?(@test_classification_file)).to be true
  end

  it "creates a file with an appropriate header" do
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)

    # The last two ARFF attributes are left out of the expected header.
    features = Core::Parser.parse_ARFF(@test_arff_file).enumerate_attributes.to_a.map { |attr| attr.name.upcase }[0...-2]
    proposed_header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *features]

    # File.readlines reads and closes the file; the earlier File.open handle
    # was never closed.
    header = File.readlines(@test_classification_file).first.split(' ')

    expect(header).to eq proposed_header
  end

  it "creates a file with an appropriate number of lines" do
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)

    samples_count = Core::Parser.parse_ARFF(@test_arff_file).n_rows

    lines = File.readlines(@test_classification_file)
    lines.shift # remove header
    expect(lines.count).to eq samples_count
  end

  it "has the short class names as class value" do
    @evaluator.create_testcorpus_classification_file!(@test_classification_file, @ground_truth)

    lines = File.readlines(@test_classification_file)
    lines.shift # remove header

    short_classes = Wikipedia::VandalismDetection::Instances::CLASSES_SHORT
    vandalism_index = Wikipedia::VandalismDetection::Instances::VANDALISM_CLASS_INDEX
    regular_index = Wikipedia::VandalismDetection::Instances::REGULAR_CLASS_INDEX
    missing_index = Wikipedia::VandalismDetection::Instances::NOT_KNOWN_INDEX

    names = [short_classes[regular_index], short_classes[vandalism_index], short_classes[missing_index]]

    lines.each do |line|
      class_name = line.split[2] # third column holds the class (header 'C')
      expect(names).to include class_name
    end
  end
end
|
342
|
+
|
343
|
+
# Specs for Evaluator#evaluate_testcorpus_classification.
describe "#evaluate_testcorpus_classification" do

  # Activates the given configuration and returns an evaluator wrapping a
  # freshly created classifier.
  def evaluator_for(config)
    use_configuration(config)

    classifier = Wikipedia::VandalismDetection::Classifier.new
    Wikipedia::VandalismDetection::Evaluator.new(classifier)
  end

  describe "exceptions" do

    it "raises an GroundTruthFileNotConfiguredError unless a ground thruth file is configured" do
      config = test_config
      config.instance_variable_set :@test_corpus_ground_truth_file, nil
      evaluator = evaluator_for(config)

      expect { evaluator.evaluate_testcorpus_classification }.to raise_error(
        Wikipedia::VandalismDetection::GroundTruthFileNotConfiguredError
      )
    end

    it "raises an GroundTruthFileNotFoundError unless the ground thruth file can be found" do
      config = test_config
      config.instance_variable_set :@test_corpus_ground_truth_file, 'false-file-name.txt'
      evaluator = evaluator_for(config)

      expect { evaluator.evaluate_testcorpus_classification }.to raise_error(
        Wikipedia::VandalismDetection::GroundTruthFileNotFoundError
      )
    end
  end

  it "returns a performance values Hash" do
    performance_values = @evaluator.evaluate_testcorpus_classification(sample_count: @sample_count)
    expect(performance_values).to be_a Hash
  end

  # One example per key expected in the returned performance hash.
  [ :fp_rates,
    :tp_rates,
    :precisions,
    :recalls,
    :pr_auc,
    :roc_auc,
    :total_precision,
    :total_recall
  ].each do |attr|
    it "returns a performance values Hash with property'#{attr}'" do
      performance_values = @evaluator.evaluate_testcorpus_classification(sample_count: @sample_count)
      expect(performance_values[attr]).to_not be_nil
    end
  end

  it "runs the classification file creation" do
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    expect(File.exist?(@test_classification_file)).to be false
    @evaluator.evaluate_testcorpus_classification
    expect(File.exist?(@test_classification_file)).to be true
  end

  it "overwrites the old classification file" do
    config = test_config

    # First run with one feature set ...
    config.instance_variable_set(:@features, ['comment length'])
    evaluator_for(config).evaluate_testcorpus_classification
    content_old = File.read(@test_classification_file)

    # ... then a second run with a different feature, so the file content
    # differs if it was rewritten.
    config.instance_variable_set(:@features, ['anonymity'])
    evaluator_for(config).evaluate_testcorpus_classification
    content_new = File.read(@test_classification_file)

    expect(content_old).to_not eq content_new
  end
end
|
422
|
+
|
423
|
+
# Specs for Evaluator#cross_validate: result type, and that it runs without
# raising both with and without the equally_distributed option.
describe "#cross_validate" do

  it "returns an evaluation object" do
    result_class = @evaluator.cross_validate.class
    expect(result_class).to eq Java::WekaClassifiers::Evaluation
  end

  it "can cross validates the classifier" do
    expect do
      @evaluator.cross_validate
    end.to_not raise_error
  end

  it "can cross validates the classifier with equally distributed samples" do
    expect do
      @evaluator.cross_validate(equally_distributed: true)
    end.to_not raise_error
  end
end
|
438
|
+
|
439
|
+
# Specs for Evaluator#curve_data. The same expectations are run once for the
# default call and once for the call with equally_distributed: true.
describe "#curve_data" do

  {
    "all samples" => lambda { |evaluator| evaluator.curve_data },
    "equally distributed samples" => lambda { |evaluator| evaluator.curve_data(equally_distributed: true) }
  }.each do |group_name, fetch_data|

    describe group_name do

      before do
        @data = fetch_data.call(@evaluator)
      end

      it "returns a Hash" do
        expect(@data).to be_a Hash
      end

      it "includes precision curve data" do
        expect(@data[:precision]).to be_an Array
      end

      it "includes recall curve data" do
        expect(@data[:recall]).to be_an Array
      end

      it "includes area_under_prc data" do
        expect(@data[:area_under_prc]).to be_a Numeric
      end

      it "has non-empty :precision Array contents" do
        expect(@data[:precision]).to_not be_empty
      end

      it "has non-empty :recall Array contents" do
        expect(@data[:recall]).to_not be_empty
      end
    end
  end
end
|
503
|
+
|
504
|
+
# Specs for Evaluator#feature_analysis: the result is a hash with one entry
# per configured feature, each holding per-threshold predictive values.
describe "#feature_analysis" do

  it "returns a hash" do
    expect(@evaluator.feature_analysis(sample_count: 100)).to be_a Hash
  end

  it "returns a hash with feature count size" do
    entry_count = @evaluator.feature_analysis(sample_count: 100).count
    expect(entry_count).to eq @config.features.count
  end

  it "returns a hash with sample count number of data hashes" do
    expected_size = 5

    @evaluator.feature_analysis(sample_count: expected_size).each_value do |threshold_hash|
      expect(threshold_hash.count).to eq expected_size
    end
  end

  it "returns the four predictive values in each features threshold hash" do
    first_feature = @evaluator.feature_analysis[@config.features.first]
    threshold_hash = first_feature[0.0]

    [:fp, :fn, :tp, :tn].each do |predictive_value|
      expect(threshold_hash).to have_key(predictive_value)
    end
  end
end
|
535
|
+
|
536
|
+
# Specs for Evaluator#full_analysis: the result is a hash keyed by threshold,
# each value holding the four predictive counts.
describe "#full_analysis" do

  it "returns a hash" do
    expect(@evaluator.full_analysis(sample_count: 100)).to be_a Hash
  end

  it "returns a hash with smaple count number of threshold hashes" do
    expected_size = 5
    threshold_count = @evaluator.full_analysis(sample_count: expected_size).count

    expect(threshold_count).to eq expected_size
  end

  it "returns the four predictive values in each features threshold hash" do
    threshold_hash = @evaluator.full_analysis[0.0]

    [:fp, :fn, :tp, :tn].each do |predictive_value|
      expect(threshold_hash).to have_key(predictive_value)
    end
  end
end
|
559
|
+
|
560
|
+
# Specs for Evaluator.true_positive?, called with a ground truth class, a
# confidence and a threshold.
describe "#true_positive?" do
  before do
    @vandalism = Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT
    @regular = Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    @threshold = 0.7
  end

  # Calls the predicate under test with the shared @threshold.
  def true_positive_for(ground_truth, confidence)
    Wikipedia::VandalismDetection::Evaluator.true_positive?(ground_truth, confidence, @threshold)
  end

  it "returns true if the given confidence is higher than a threshold regarding the ground truth 'V'" do
    expect(true_positive_for(@vandalism, 0.8)).to be true
  end

  it "returns false if the given confidence is lower than a threshold regarding the ground truth 'V'" do
    expect(true_positive_for(@vandalism, 0.5)).to be false
  end

  it "returns false for the same confidence and threshold if ground truth is 'V'" do
    expect(true_positive_for(@vandalism, @threshold)).to be false
  end

  it "returns false if the given confidence is higher than a threshold regarding the ground truth 'R'" do
    expect(true_positive_for(@regular, 0.8)).to be false
  end

  it "returns false if the given confidence is lower than a threshold regarding the ground truth 'R'" do
    expect(true_positive_for(@regular, 0.5)).to be false
  end
end
|
587
|
+
|
588
|
+
# Specs for Evaluator.true_negative?, called with a ground truth class, a
# confidence and a threshold.
describe "#true_negative?" do
  before do
    @vandalism = Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT
    @regular = Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    @threshold = 0.7
  end

  # Calls the predicate under test with the shared @threshold.
  def true_negative_for(ground_truth, confidence)
    Wikipedia::VandalismDetection::Evaluator.true_negative?(ground_truth, confidence, @threshold)
  end

  it "returns true if the given confidence is lower than a threshold regarding the ground truth 'R'" do
    expect(true_negative_for(@regular, 0.5)).to be true
  end

  it "returns false if the given confidence is higher than a threshold regarding the ground truth 'R'" do
    expect(true_negative_for(@regular, 0.8)).to be false
  end

  it "returns false for the same confidence and threshold if ground truth is 'R'" do
    expect(true_negative_for(@regular, @threshold)).to be false
  end

  it "returns false if the given confidence is lower than a threshold regarding the ground truth 'V'" do
    expect(true_negative_for(@vandalism, 0.5)).to be false
  end

  it "returns false if the given confidence is higher than a threshold regarding the ground truth 'V'" do
    expect(true_negative_for(@vandalism, 0.8)).to be false
  end
end
|
615
|
+
|
616
|
+
# Specs for Evaluator.false_positive?, called with a ground truth class, a
# confidence and a threshold.
describe "#false_positive?" do
  before do
    @vandalism = Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT
    @regular = Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    @threshold = 0.7
  end

  # Calls the predicate under test with the shared @threshold.
  def false_positive_for(ground_truth, confidence)
    Wikipedia::VandalismDetection::Evaluator.false_positive?(ground_truth, confidence, @threshold)
  end

  it "returns true if the given confidence is higher than a threshold regarding the ground truth 'R'" do
    expect(false_positive_for(@regular, 0.8)).to be true
  end

  it "returns false if the given confidence is lower than a threshold regarding the ground truth 'R'" do
    expect(false_positive_for(@regular, 0.5)).to be false
  end

  it "returns true for the same confidence and threshold if ground truth is 'R'" do
    expect(false_positive_for(@regular, @threshold)).to be true
  end

  it "returns false if the given confidence is higher than a threshold regarding the ground truth 'V'" do
    expect(false_positive_for(@vandalism, 0.8)).to be false
  end

  it "returns false if the given confidence is lower than a threshold regarding the ground truth 'V'" do
    expect(false_positive_for(@vandalism, 0.5)).to be false
  end
end
|
643
|
+
|
644
|
+
# Specs for Evaluator.false_negative?, called with a ground truth class, a
# confidence and a threshold.
describe "#false_negative?" do
  before do
    @vandalism = Wikipedia::VandalismDetection::Instances::VANDALISM_SHORT
    @regular = Wikipedia::VandalismDetection::Instances::REGULAR_SHORT
    @threshold = 0.7
  end

  # Calls the predicate under test with the shared @threshold.
  def false_negative_for(ground_truth, confidence)
    Wikipedia::VandalismDetection::Evaluator.false_negative?(ground_truth, confidence, @threshold)
  end

  it "returns true if the given confidence is lower than a threshold regarding the ground truth 'V'" do
    expect(false_negative_for(@vandalism, 0.5)).to be true
  end

  it "returns false if the given confidence is higher than a threshold regarding the ground truth 'V'" do
    expect(false_negative_for(@vandalism, 0.8)).to be false
  end

  it "returns true for the same confidence and threshold if ground truth is 'V'" do
    expect(false_negative_for(@vandalism, @threshold)).to be true
  end

  it "returns false if the given confidence is lower than a threshold regarding the ground truth 'R'" do
    expect(false_negative_for(@regular, 0.5)).to be false
  end

  it "returns false if the given confidence is higher than a threshold regarding the ground truth 'R'" do
    expect(false_negative_for(@regular, 0.8)).to be false
  end
end
|
671
|
+
end
|