wikipedia-vandalism_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +4 -0
- data/README.md +265 -0
- data/Rakefile +12 -0
- data/lib/java/LibSVM.jar +0 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/libsvm.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/functions/lib_svm.rb +15 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +25 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +17 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +18 -0
- data/lib/weka/filters/supervised/instance/smote.rb +22 -0
- data/lib/wikipedia.rb +51 -0
- data/lib/wikipedia/vandalism_detection.rb +30 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +18 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +69 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +186 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +321 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +27 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +75 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +606 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +40 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +89 -0
- data/lib/wikipedia/vandalism_detection/features.rb +67 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +54 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +65 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +24 -0
- data/lib/wikipedia/vandalism_detection/features/user_reputation.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +22 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +130 -0
- data/lib/wikipedia/vandalism_detection/page.rb +88 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +52 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +69 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +43 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +367 -0
- data/lib/wikipedia/vandalism_detection/text.rb +18 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +303 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +21 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +22 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +15 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +12 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +97 -0
- data/spec/factories/edit.rb +20 -0
- data/spec/factories/page.rb +13 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/config.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/macros/file_reading.rb +7 -0
- data/spec/support/macros/test_configuration.rb +71 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +36 -0
- data/spec/vandalism_detection/classifier_spec.rb +317 -0
- data/spec/vandalism_detection/configuration_spec.rb +517 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +137 -0
- data/spec/vandalism_detection/evaluator_spec.rb +671 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +128 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +61 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +23 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/base_spec.rb +49 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +38 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +37 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +27 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +34 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +42 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +33 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +33 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +49 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +51 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +26 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +41 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +35 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +35 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +26 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +35 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +35 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +44 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +28 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +46 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +60 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +59 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +35 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +57 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +38 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +50 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +22 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +37 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +35 -0
- data/spec/vandalism_detection/features/user_reputation_spec.rb +52 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +36 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +58 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +22 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +35 -0
- data/spec/vandalism_detection/instances_spec.rb +156 -0
- data/spec/vandalism_detection/page_parser_spec.rb +184 -0
- data/spec/vandalism_detection/page_spec.rb +135 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +115 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +231 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +264 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +72 -0
- data/spec/weka/classifiers/functions/lib_svm_spec.rb +38 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +76 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +40 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +40 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +6 -0
- data/wikipedia-vandalism_detection.gemspec +30 -0
- metadata +512 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wikipedia::VandalismDetection::Page do
|
4
|
+
|
5
|
+
describe "constants" do
|
6
|
+
it "has a START_TAG constant" do
|
7
|
+
expect(Wikipedia::VandalismDetection::Page::START_TAG).to eq '<page>'
|
8
|
+
end
|
9
|
+
|
10
|
+
it "has an END_TAG constant" do
|
11
|
+
expect(Wikipedia::VandalismDetection::Page::END_TAG).to eq '</page>'
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
before do
|
16
|
+
@page = Wikipedia::VandalismDetection::Page.new
|
17
|
+
end
|
18
|
+
|
19
|
+
it "has a title" do
|
20
|
+
expect(@page).to respond_to :title
|
21
|
+
end
|
22
|
+
|
23
|
+
it "has an id" do
|
24
|
+
expect(@page).to respond_to :id
|
25
|
+
end
|
26
|
+
|
27
|
+
it "has revisions" do
|
28
|
+
expect(@page.revisions).to be_a Hash
|
29
|
+
end
|
30
|
+
|
31
|
+
it "has revisions with default {}" do
|
32
|
+
expect(@page.revisions).to be_empty
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "#edits" do
|
36
|
+
|
37
|
+
it {should respond_to :edits }
|
38
|
+
|
39
|
+
it "returns an empty array if no revision is available" do
|
40
|
+
expect(@page.revisions).to be_empty
|
41
|
+
expect(@page.edits).to be_an(Array)
|
42
|
+
expect(@page.edits).to be_empty
|
43
|
+
end
|
44
|
+
|
45
|
+
it "resets the @update_edits flag to false" do
|
46
|
+
@page.add_revision build(:empty_revision, id: '1')
|
47
|
+
@page.edits
|
48
|
+
expect(@page.instance_variable_get(:@update_edits)).to be false
|
49
|
+
end
|
50
|
+
|
51
|
+
it "computes edits from the page's revisions" do
|
52
|
+
@page.add_revision build(:empty_revision, id: '1')
|
53
|
+
@page.add_revision build(:empty_revision, id: '3', parent_id: "2")
|
54
|
+
@page.add_revision build(:empty_revision, id: '2', parent_id: "1")
|
55
|
+
|
56
|
+
expect(@page.edits.count).to eq 2
|
57
|
+
end
|
58
|
+
|
59
|
+
it "computes edits of which each holds the parent page as reference" do
|
60
|
+
@page.id = '1234'
|
61
|
+
@page.title = 'Article'
|
62
|
+
|
63
|
+
@page.add_revision build(:empty_revision, id: '1')
|
64
|
+
@page.add_revision build(:empty_revision, id: '3', parent_id: "2")
|
65
|
+
@page.add_revision build(:empty_revision, id: '2', parent_id: "1")
|
66
|
+
|
67
|
+
@page.edits.each do |edit|
|
68
|
+
expect(edit.page).to eq @page
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
describe "#add_revision" do
|
74
|
+
|
75
|
+
it { should respond_to :add_revision }
|
76
|
+
|
77
|
+
it "takes a revision and adds it to revisions" do
|
78
|
+
revision = build :empty_revision
|
79
|
+
expect { @page.add_revision(revision) }.to change(@page.revisions, :count).by(1)
|
80
|
+
end
|
81
|
+
|
82
|
+
it "sets the @update_edits flag to true after adding a revision" do
|
83
|
+
revision = build :empty_revision
|
84
|
+
@page.add_revision(revision)
|
85
|
+
expect(@page.instance_variable_get(:@update_edits)).to be true
|
86
|
+
end
|
87
|
+
|
88
|
+
it "sets the @update_reverted_edits flag to true after adding a revision" do
|
89
|
+
revision = build :empty_revision
|
90
|
+
@page.add_revision(revision)
|
91
|
+
expect(@page.instance_variable_get(:@update_reverted_edits)).to be true
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe "#reverted_edits" do
|
96
|
+
|
97
|
+
it {should respond_to :reverted_edits }
|
98
|
+
|
99
|
+
it "returns reverted edits by comparing the sha1 values" do
|
100
|
+
# principle:
|
101
|
+
# in edit wars the in-between of the first revert triple which has another hash before
|
102
|
+
# can be seen as vandalism (here revision with id 3)
|
103
|
+
|
104
|
+
revision_1 = build(:empty_revision, id: 1, parent_id: nil, sha1: 'hash0')
|
105
|
+
revision_2 = build(:empty_revision, id: 2, parent_id: 1, sha1: 'hash1')
|
106
|
+
revision_3 = build(:empty_revision, id: 3, parent_id: 2, sha1: 'hash2')
|
107
|
+
revision_4 = build(:empty_revision, id: 4, parent_id: 3, sha1: 'hash1')
|
108
|
+
revision_5 = build(:empty_revision, id: 5, parent_id: 4, sha1: 'hash2')
|
109
|
+
revision_6 = build(:empty_revision, id: 6, parent_id: 5, sha1: 'hash3')
|
110
|
+
|
111
|
+
@page.add_revision(revision_3)
|
112
|
+
@page.add_revision(revision_6)
|
113
|
+
@page.add_revision(revision_1)
|
114
|
+
@page.add_revision(revision_5)
|
115
|
+
@page.add_revision(revision_4)
|
116
|
+
@page.add_revision(revision_2)
|
117
|
+
|
118
|
+
expect(@page.reverted_edits.map { |edit| edit.new_revision.id }).to eq [3]
|
119
|
+
end
|
120
|
+
|
121
|
+
it "returns reverted edit if no previous revision is available" do
|
122
|
+
revision_1 = build(:empty_revision, id: 1, parent_id: nil, sha1: 'hash1')
|
123
|
+
revision_2 = build(:empty_revision, id: 2, parent_id: 1, sha1: 'hash2')
|
124
|
+
revision_3 = build(:empty_revision, id: 3, parent_id: 2, sha1: 'hash1')
|
125
|
+
revision_4 = build(:empty_revision, id: 4, parent_id: 3, sha1: 'hash2')
|
126
|
+
|
127
|
+
@page.add_revision(revision_3)
|
128
|
+
@page.add_revision(revision_1)
|
129
|
+
@page.add_revision(revision_4)
|
130
|
+
@page.add_revision(revision_2)
|
131
|
+
|
132
|
+
expect(@page.reverted_edits.map { |edit| edit.new_revision.id }).to eq [2]
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wikipedia::VandalismDetection::RevisionParser do
|
4
|
+
|
5
|
+
before do
|
6
|
+
@parser = Wikipedia::VandalismDetection::RevisionParser.new
|
7
|
+
@xml = load_file('revision_simplified.xml')
|
8
|
+
|
9
|
+
@revision = @parser.parse @xml
|
10
|
+
@expected_revision = build(:empty_revision,
|
11
|
+
id: 'id1',
|
12
|
+
parent_id: 'parentid1',
|
13
|
+
timestamp: 'time1',
|
14
|
+
contributor: 'ip1',
|
15
|
+
comment: 'comment 1',
|
16
|
+
text: "text 1",
|
17
|
+
sha1: 'hash1')
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "#parse" do
|
21
|
+
it "returns a Wikipedia::VandalismDetection::Revision object" do
|
22
|
+
expect(@revision).to be_a Wikipedia::VandalismDetection::Revision
|
23
|
+
end
|
24
|
+
|
25
|
+
it "returns a revision with only the configured properties" do
|
26
|
+
@revision = @parser.parse(@xml, only: [:id, :parent_id])
|
27
|
+
|
28
|
+
[:id, :parent_id].each do |attr|
|
29
|
+
expect(@revision.send(attr)).not_to be_nil
|
30
|
+
end
|
31
|
+
|
32
|
+
[:timestamp, :contributor, :sha1].each do |attr|
|
33
|
+
expect(@revision.send(attr)).to be_nil
|
34
|
+
end
|
35
|
+
|
36
|
+
[:comment, :text].each do |attr|
|
37
|
+
expect(@revision.send(attr)).to eq ""
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "a single revision content parsing" do
|
43
|
+
it "can read a single revision dump text input" do
|
44
|
+
expect(@revision).to_not be_nil
|
45
|
+
end
|
46
|
+
|
47
|
+
[:id, :timestamp, :contributor, :comment, :text, :sha1].each do |attr|
|
48
|
+
it "has the expected #{attr}" do
|
49
|
+
expect(@revision.send(attr)).to eq @expected_revision.send(attr)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wikipedia::VandalismDetection::Revision do
|
4
|
+
|
5
|
+
describe "constants" do
|
6
|
+
|
7
|
+
it "has a START_TAG constant" do
|
8
|
+
expect(Wikipedia::VandalismDetection::Revision::START_TAG).to eq '<revision>'
|
9
|
+
end
|
10
|
+
|
11
|
+
it "has an END_TAG constant" do
|
12
|
+
expect(Wikipedia::VandalismDetection::Revision::END_TAG).to eq '</revision>'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
before do
|
17
|
+
@revision = Wikipedia::VandalismDetection::Revision.new
|
18
|
+
@instance_variables = [:id, :parent_id, :timestamp, :comment, :text, :contributor_id, :contributor_ip, :sha1]
|
19
|
+
@nil_instance_variables = [:id, :parent_id, :timestamp, :contributor_id, :contributor_ip, :contributor_username, :sha1]
|
20
|
+
@read_only_attributes = [:contributor_id, :contributor_ip]
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#anonymous_contributor?" do
|
24
|
+
it { should respond_to :anonymous_contributor? }
|
25
|
+
|
26
|
+
it "returns true in case of an anonymous user" do
|
27
|
+
@anonymous_revision = build :anonymous_revision
|
28
|
+
expect(@anonymous_revision.anonymous_contributor?).to be true
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "#contributor=" do
|
33
|
+
it { should respond_to :contributor= }
|
34
|
+
|
35
|
+
it "sets the @contributor_id if contributor is no IPv4" do
|
36
|
+
id = "12345"
|
37
|
+
@revision.contributor = id
|
38
|
+
|
39
|
+
expect(@revision.instance_variable_get(:@contributor_id)).to eq id
|
40
|
+
expect(@revision.instance_variable_get(:@contributor_ip)).to be_nil
|
41
|
+
end
|
42
|
+
|
43
|
+
it "sets the @contributor_ip if contributor is an IPv4" do
|
44
|
+
ip = "127.0.0.1"
|
45
|
+
@revision.contributor = ip
|
46
|
+
|
47
|
+
expect(@revision.instance_variable_get(:@contributor_ip)).to eq ip
|
48
|
+
expect(@revision.instance_variable_get(:@contributor_id)).to be_nil
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
describe "#contributor" do
|
53
|
+
it { should respond_to :contributor }
|
54
|
+
|
55
|
+
it "returns the contributor_id if set" do
|
56
|
+
id = "12345"
|
57
|
+
@revision.contributor = id
|
58
|
+
|
59
|
+
expect(@revision.contributor).to eq @revision.instance_variable_get(:@contributor_id)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "returns the contributor_ip if set" do
|
63
|
+
ip = "127.0.0.1"
|
64
|
+
@revision.contributor = ip
|
65
|
+
|
66
|
+
expect(@revision.contributor).to eq @revision.instance_variable_get(:@contributor_ip)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
it "has the revision attributes" do
|
71
|
+
@instance_variables.each do |name|
|
72
|
+
expect(@revision).to respond_to name
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
it "defaults its attributes to nil" do
|
77
|
+
@nil_instance_variables.each do |name|
|
78
|
+
expect(@revision.send(name)).to be_nil
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
it "raises a NoMethodError when assigning read-only attributes" do
|
83
|
+
@read_only_attributes.each do |name|
|
84
|
+
expect { @revision.send("#{name}=", "") }.to raise_error NoMethodError
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
it "has an empty default text" do
|
89
|
+
expect(@revision.text).to be_empty
|
90
|
+
end
|
91
|
+
|
92
|
+
it "has a text of type Wikipedia::VandalismDetection::Text" do
|
93
|
+
expect(@revision.text).to be_a Wikipedia::VandalismDetection::Text
|
94
|
+
end
|
95
|
+
|
96
|
+
it "has an empty default comment" do
|
97
|
+
expect(@revision.comment).to be_empty
|
98
|
+
end
|
99
|
+
|
100
|
+
it "has a comment of type Wikipedia::VandalismDetection::Text" do
|
101
|
+
expect(@revision.comment).to be_a Wikipedia::VandalismDetection::Text
|
102
|
+
end
|
103
|
+
|
104
|
+
it { should respond_to :redirect? }
|
105
|
+
|
106
|
+
it "is marked as redirect if #REDIRECT appears in its text" do
|
107
|
+
@revision.text = "#REDIRECT [[Redirect Page Name]]\n"
|
108
|
+
expect(@revision.redirect?).to be true
|
109
|
+
end
|
110
|
+
|
111
|
+
it "is not marked as redirect if #REDIRECT does not appear in its text" do
|
112
|
+
@revision.text = "''text''"
|
113
|
+
expect(@revision.redirect?).to be false
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,231 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'ruby-band'
|
4
|
+
|
5
|
+
describe Wikipedia::VandalismDetection::TestDataset do
|
6
|
+
|
7
|
+
before do
|
8
|
+
use_test_configuration
|
9
|
+
@config = test_config
|
10
|
+
|
11
|
+
@arff_file = @config.test_output_arff_file
|
12
|
+
@index_file = @config.test_output_index_file
|
13
|
+
@features = @config.features
|
14
|
+
|
15
|
+
@arff_files_dir = File.join(@config.output_base_directory, 'test')
|
16
|
+
end
|
17
|
+
|
18
|
+
# Deletes the files the examples may have written: the combined arff file,
# the corpus index file and the per-feature arff files, together with the
# directories that contained the arff files.
after do
  # File.exist? is used instead of File.exists?, which was deprecated and
  # later removed from Ruby; the examples in this file already use File.exist?.
  if File.exist?(@arff_file)
    File.delete(@arff_file)
    FileUtils.rm_r(File.dirname(@arff_file))
  end

  File.delete(@index_file) if File.exist?(@index_file)

  # remove feature arff files
  @config.features.each do |name|
    file = File.join(@arff_files_dir, name.gsub(' ', '_') + '.arff')
    next unless File.exist?(file)

    File.delete(file)
    FileUtils.rm_r(File.dirname(file))
  end
end
|
36
|
+
|
37
|
+
# Examples for TestDataset.build: error handling, per-feature arff files and
# the shape of the returned dataset.
describe "#build" do
  describe "exceptions" do
    it "raises an EditsFileNotConfiguredError if no edits file is configured" do
      config = test_config
      config.instance_variable_set :@test_corpus_edits_file, nil
      use_configuration(config)

      expect { Wikipedia::VandalismDetection::TestDataset.build }.to raise_error \
        Wikipedia::VandalismDetection::EditsFileNotConfiguredError
    end
  end

  it "returns a weka instances" do
    dataset = Wikipedia::VandalismDetection::TestDataset.build
    expect(dataset.class).to be Java::WekaCore::Instances
  end

  # One example is defined per entry of the default feature list.
  Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['features'].each do |name|
    it "creates an arff file for the feature '#{name}'" do
      config = test_config
      config.instance_variable_set :@features, [name]
      use_configuration(config)

      file = File.join(@arff_files_dir, name.gsub(' ', '_') + '.arff')

      expect(File.exist?(file)).to be false
      Wikipedia::VandalismDetection::TestDataset.build
      expect(File.exist?(file)).to be true
    end
  end

  it "creates only feature files that are not available yet" do
    config = test_config
    config.instance_variable_set :@features, ['anonymity', 'comment length']
    use_configuration(config)

    anonymity_file = File.join(config.output_base_directory, 'test', 'anonymity.arff')

    # create file manually, so it is existent when building dataset
    data = [10000, 123456, 234567]
    anonymity = Wikipedia::VandalismDetection::Instances.empty_for_test_feature('anonymity')
    6.times { anonymity.add_instance(data) }
    anonymity.to_ARFF(anonymity_file)

    Wikipedia::VandalismDetection::TestDataset.build

    # anonymity should not be overwritten
    expect(Core::Parser.parse_ARFF(anonymity_file).to_a2d.first).to eq data
  end

  describe "internal algorithm" do
    it "builds the right number of data lines" do
      # File.foreach counts the lines without leaving an open file handle
      # behind and without relying on IO#lines, which newer Rubies removed.
      # NOTE(review): this reads the *training* edits file inside a
      # TestDataset spec - confirm that this is intended.
      edits_count = File.foreach(@config.training_corpus_edits_file).count - 1
      additional_header_lines = 4 # without class
      revision_id_lines = 2 # old and new revision id attributes
      class_line = 1

      lines_count = additional_header_lines + edits_count + @features.count + revision_id_lines + class_line
      dataset = Wikipedia::VandalismDetection::TestDataset.build

      expect(dataset.to_s.lines.count).to eq lines_count
    end

    it "builds the right number of data columns" do
      old_and_new_edit_attr_count = 2
      class_value = 1
      dataset = Wikipedia::VandalismDetection::TestDataset.build

      expect(dataset.n_col).to eq @features.count + class_value + old_and_new_edit_attr_count
    end

    it "builds a class attribute" do
      dataset = Wikipedia::VandalismDetection::TestDataset.build
      expect(dataset.enumerate_attributes.to_a[-1].name).to eq 'class'
    end
  end

  it "normalizes the numeric features if LibSVM is used as classifier" do
    config = test_config
    config.instance_variable_set :@classifier_type, 'Functions::LibSVM'
    use_configuration(config)

    dataset = Wikipedia::VandalismDetection::TestDataset.build

    # Debug printing of the dataset and its rows was dropped here; it only
    # cluttered the spec output and took no part in the assertions.
    dataset.to_a2d.each do |instance|
      numerics = instance[0...-3] # feature values
      edit_ids = instance[-3..-2] # revision ids

      numerics.each { |value| expect(value).to be_between(0.0, 1.0) }
      edit_ids.each { |value| expect(value).to be > 1 }
    end
  end
end
|
132
|
+
|
133
|
+
# TestDataset.instances must yield the same dataset as TestDataset.build,
# compared through their string representations.
describe "#instances" do
  it "is an alias method for #build" do
    built = Wikipedia::VandalismDetection::TestDataset.build
    aliased = Wikipedia::VandalismDetection::TestDataset.instances

    expect(built.to_s).to eq(aliased.to_s)
  end
end
|
141
|
+
|
142
|
+
# Examples for TestDataset.create_corpus_file_index!. The group label now
# carries the method name that the examples below actually call.
describe "#create_corpus_file_index!" do
  it "responds to #create_corpus_file_index!" do
    expect(Wikipedia::VandalismDetection::TestDataset).to respond_to :create_corpus_file_index!
  end

  describe "exceptions" do
    it "raises an RevisionsDirectoryNotConfiguredError if no revisions directory is configured" do
      config = test_config
      config.instance_variable_set :@test_corpus_revisions_directory, nil
      use_configuration(config)

      expect { Wikipedia::VandalismDetection::TestDataset.create_corpus_file_index! }.to raise_error \
        Wikipedia::VandalismDetection::RevisionsDirectoryNotConfiguredError
    end
  end

  it "creates a corpus_index.yml file in the build directory" do
    # The index file must be absent before the call and present afterwards.
    expect(File.exist?(@index_file)).to be false
    Wikipedia::VandalismDetection::TestDataset.create_corpus_file_index!
    expect(File.exist?(@index_file)).to be true
  end
end
|
164
|
+
|
165
|
+
# Examples for TestDataset.build!, which writes the dataset to the
# configured arff file.
describe "#build!" do
  it "should respond to #build!" do
    expect(Wikipedia::VandalismDetection::TestDataset).to respond_to(:build!)
  end

  it "creates an .arff file in the directory defined in config.yml" do
    expect(File.exist?(@arff_file)).to be(false)
    Wikipedia::VandalismDetection::TestDataset.build!
    expect(File.exist?(@arff_file)).to be(true)
  end

  it "overwrites existing test arff file" do
    use_test_configuration

    # test config uses 3 features + 2 edit id columns + 1 class value = 6
    Wikipedia::VandalismDetection::TestDataset.build!
    columns_before = Core::Parser.parse_ARFF(@arff_file).n_col
    expect(columns_before).to eq(6)

    single_feature_config = test_config
    single_feature_config.instance_variable_set(:@features, ['anonymity'])
    use_configuration(single_feature_config)

    # uses only 1 feature + 2 edit id columns + 1 class value = 4
    Wikipedia::VandalismDetection::TestDataset.build!
    columns_after = Core::Parser.parse_ARFF(@arff_file).n_col

    expect(columns_after).to eq(4)
  end
end
|
195
|
+
|
196
|
+
# Examples for TestDataset.edit(old_revision_id, new_revision_id).
describe "#edit" do
  it "raises an EditsFileNotConfiguredError if no edits file is configured" do
    config = test_config
    config.instance_variable_set(:@test_corpus_edits_file, nil)
    use_configuration(config)

    expect { Wikipedia::VandalismDetection::TestDataset.edit('1', '2') }
      .to raise_error(Wikipedia::VandalismDetection::EditsFileNotConfiguredError)
  end

  it "returns nil if Edit could not be found" do
    missing = Wikipedia::VandalismDetection::TestDataset.edit('1', '2')
    expect(missing).to be_nil
  end

  it "returns an Edit" do
    found = Wikipedia::VandalismDetection::TestDataset.edit('307084144', '326873205')
    expect(found).to be_a(Wikipedia::VandalismDetection::Edit)
  end

  it "returns an edit whose parent page title is not nil" do
    found = Wikipedia::VandalismDetection::TestDataset.edit('307084144', '326873205')
    expect(found.page.title).to_not be_nil
  end

  it "returns an edit whose parent page id is not nil" do
    found = Wikipedia::VandalismDetection::TestDataset.edit('307084144', '326873205')
    expect(found.page.id).to_not be_nil
  end

  it "returns nil for a not annotated edit with given revision ids" do
    unannotated = Wikipedia::VandalismDetection::TestDataset.edit('328774088', '328774188')
    expect(unannotated).to be_nil
  end
end
|
231
|
+
end
|