wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for FeatureCalculator: construction, computing every configured
# feature for an edit, computing a single named feature, and listing the
# configured features.
describe Wikipedia::VandalismDetection::FeatureCalculator do
  let(:edit) { build(:edit) }

  it 'raises FeaturesNotConfiguredError when no features are configured' do
    # A separately constructed Configuration object is used, so the nil
    # feature list is set on this object only.
    config = Wikipedia::VandalismDetection::Configuration.send(:new)
    config.instance_variable_set(:@features, nil)

    use_configuration(config)

    expect { Wikipedia::VandalismDetection::FeatureCalculator.new }
      .to raise_error Wikipedia::VandalismDetection::FeaturesNotConfiguredError
  end

  # RSpec runs this hook before every example of the group, including the
  # example declared above it.
  before do
    use_test_configuration
    @calculator = Wikipedia::VandalismDetection::FeatureCalculator.new
  end

  describe '#calculate_features_for' do
    it { is_expected.to respond_to :calculate_features_for }

    it 'takes an edit as parameter' do
      # A negated raise_error must not name an error class: RSpec 3 rejects
      # or warns about that form, because any other error would let the
      # expectation pass unnoticed.
      expect { @calculator.calculate_features_for(edit) }.not_to raise_error
    end

    it 'raises an error if called with wrong parameter type' do
      revision = build(:empty_revision)
      expect { @calculator.calculate_features_for(revision) }
        .to raise_error ArgumentError
    end

    it 'returns an array' do
      expect(@calculator.calculate_features_for(edit)).to be_an Array
    end

    it 'returns the computed numeric feature values' do
      feature_values = @calculator.calculate_features_for(edit)
      expect(feature_values.all? { |value| value.is_a?(Numeric) }).to be true
    end

    it 'returns the right number of feature values' do
      count = @calculator.used_features.count
      expect(@calculator.calculate_features_for(edit).count).to eq count
    end

    it 'uses the cleaned up text if revision contains a #REDIRECT' do
      redirect_text = Text.new('#REDIRECT [[Redirect page]]')
      old_revision_redirect = build(:old_revision, text: redirect_text)
      new_revision_redirect = build(:new_revision, text: redirect_text)
      old_revision = build(:old_revision)
      new_revision = build(:new_revision)

      # One edit with the redirect on the old side, one with it on the new side.
      edit_a = Wikipedia::VandalismDetection::Edit.new(old_revision_redirect, new_revision)
      edit_b = Wikipedia::VandalismDetection::Edit.new(old_revision, new_revision_redirect)

      config = Wikipedia::VandalismDetection.config
      count = config.features.count

      expect(@calculator.calculate_features_for(edit_a).count).to eq count
      expect(@calculator.calculate_features_for(edit_b).count).to eq count
    end

    it 'includes -1 for not extractable texts in either revision' do
      # NOTE(review): this changes @features on the shared
      # Configuration.instance and never restores it; confirm that
      # use_test_configuration resets the feature list for later examples.
      config = Wikipedia::VandalismDetection::Configuration.instance
      config.instance_variable_set(:@features, ['all wordlists impact'])

      use_configuration(config)

      unparsable_wiki_text = Text.new("[[Image:img.jpg|\n{|\n|-\n|||| |}")

      old_revision_unparsable = build(:old_revision, text: unparsable_wiki_text)
      new_revision_unparsable = build(:new_revision, text: unparsable_wiki_text)

      old_revision = build(:old_revision)
      new_revision = build(:new_revision)

      edit_a = Wikipedia::VandalismDetection::Edit.new(old_revision_unparsable, new_revision)
      edit_b = Wikipedia::VandalismDetection::Edit.new(old_revision, new_revision_unparsable)

      # The implicit subject is instantiated lazily, i.e. after
      # use_configuration above, whereas @calculator was already built in the
      # before hook.
      expect(subject.calculate_features_for(edit_a)).to include Features::MISSING_VALUE
      expect(subject.calculate_features_for(edit_b)).to include Features::MISSING_VALUE
    end
  end

  describe '#calculate_feature_for' do
    let(:feature_name) { 'anonymity' }
    let(:random_number) { rand(1000) }
    let(:empty_revision) { build(:empty_revision) }

    # Stub the Anonymity feature so the examples can check that the
    # calculator passes on exactly the value the feature class returns.
    before do
      allow_any_instance_of(Features::Anonymity)
        .to receive(:calculate)
        .and_return(random_number)
    end

    it { is_expected.to respond_to :calculate_feature_for }

    it 'takes an edit and feature name as parameter' do
      # Bare negated raise_error; see '#calculate_features_for' above.
      expect { @calculator.calculate_feature_for(edit, feature_name) }
        .not_to raise_error
    end

    it 'raises an error if called with wrong parameter type edit' do
      expect { @calculator.calculate_feature_for(empty_revision, feature_name) }
        .to raise_error ArgumentError
    end

    it 'raises an error if called with wrong parameter type feature name' do
      expect { @calculator.calculate_feature_for(edit, empty_revision) }
        .to raise_error ArgumentError
    end

    it 'returns a Numeric' do
      expect(@calculator.calculate_feature_for(edit, feature_name))
        .to be_a Numeric
    end

    it 'returns the value calculated by the feature class' do
      expect(@calculator.calculate_feature_for(edit, feature_name))
        .to eq random_number
    end
  end

  describe '#used_features' do
    it { is_expected.to respond_to :used_features }

    it 'returns an array of the features defined in the config feature.yml' do
      features = Wikipedia::VandalismDetection.config.features
      expect(@calculator.used_features).to match_array features
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the AllWordlistsFrequency feature: share of word-list words among
# the words inserted by an edit.
describe Wikipedia::VandalismDetection::Features::AllWordlistsFrequency do
  # Builds an edit whose old and new revisions carry the given wiki markup.
  def edit_between(old_markup, new_markup)
    previous = build(:old_revision, text: Text.new(old_markup))
    current = build(:new_revision, text: Text.new(new_markup))
    build(:edit, old_revision: previous, new_revision: current)
  end

  it { is_expected.to be_a Features::FrequencyBase }

  describe '#calculate' do
    it 'returns the inserted number of all lists words over all inserted' do
      # inserted: total 7 words, 1 vulgarism, 1 biased, 1 pronouns = 3 bad
      edit = edit_between(
        'Your old shit. ',
        'Your old shit. Fuck you great, and all the others.'
      )

      expect(subject.calculate(edit)).to eq(3.0 / 7.0)
    end

    it 'returns 0.0 on empty clean inserted text' do
      # Only a template is inserted.
      edit = edit_between(
        'Your old shit. ',
        'Your old shit. {{speedy deletion}}'
      )

      expect(subject.calculate(edit)).to eq(0.0)
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the AllWordlistsImpact feature. The expected value in the first
# example is old count / (old count + new count) of word-list words.
describe Wikipedia::VandalismDetection::Features::AllWordlistsImpact do
  it { is_expected.to be_a Features::ImpactBase }

  describe '#calculate' do
    it 'returns the impact of all wordlists words of the new revision text' do
      # 1 vulgarism, 2 pronouns, 0 biased = 3 bad
      old_text = Text.new('Fuck you, you and all the others')
      # 3 vulgarism, 3 pronouns, 1 biased = 7 bad
      new_text = Text.new('Fuck you great, fuck you, you all others sluts')

      old_rev = build(:old_revision, text: old_text)
      new_rev = build(:new_revision, text: new_text)
      edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

      expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 7.0)
    end

    it 'returns 0.5 if both text revisions have no terms' do
      # Both revisions consist of a template only.
      text = Text.new('{{speedy deletion}}')

      old_rev = build(:old_revision, text: text)
      new_rev = build(:new_revision, text: text)
      edit = build(:edit, new_revision: new_rev, old_revision: old_rev)

      expect(subject.calculate(edit)).to eq 0.5
    end

    it 'returns 0.0 for an empty clean text in the old revision' do
      old_text = Text.new('{{speedy deletion}}')
      new_text = Text.new('fuck')

      old_rev = build(:old_revision, text: old_text)
      new_rev = build(:new_revision, text: new_text)
      edit = build(:edit, new_revision: new_rev, old_revision: old_rev)

      expect(subject.calculate(edit)).to eq 0.0
    end

    it 'returns 1.0 for an empty clean text in the new revision' do
      old_text = Text.new('fuck')
      new_text = Text.new('{{speedy deletion}}')

      old_rev = build(:old_revision, text: old_text)
      new_rev = build(:new_revision, text: new_text)
      edit = build(:edit, new_revision: new_rev, old_revision: old_rev)

      expect(subject.calculate(edit)).to eq 1.0
    end
  end
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the AnonymityPrevious feature: 1 for a registered previous
# editor, 0 for an anonymous (IP) previous editor, MISSING_VALUE when the
# previous revision cannot be resolved.
describe Wikipedia::VandalismDetection::Features::AnonymityPrevious do
  it { is_expected.to be_a Features::Base }

  describe '#calculate' do
    context 'both contributors are given' do
      it 'returns 1.0 in case of a registered previous editor' do
        old_rev = build(:old_revision, contributor: 'Peter')
        new_rev = build(:new_revision)
        edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

        expect(subject.calculate(edit)).to eq 1
      end

      it 'returns 0.0 in case of an anonymous previous editor' do
        old_rev = build(:old_revision, contributor: '137.163.16.199')
        new_rev = build(:new_revision)
        edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

        expect(subject.calculate(edit)).to eq 0
      end
    end

    # NOTE(review): going by the example descriptions, the examples in this
    # context query the Wikipedia API with real revision ids - confirm
    # whether they need network access or are stubbed elsewhere.
    context 'previous contributor not given' do
      context 'for a registered previous editor' do
        it 'requests the user from Wikipedia API and returns 1' do
          old_rev = build(:old_revision, id: '324557983', contributor: nil)
          new_rev = build(
            :new_revision,
            id: '329962649',
            parent_id: '324557983',
            contributor: 'Tomaxer'
          )

          edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

          expect(subject.calculate(edit)).to eq 1
        end
      end

      context 'for an anonymous previous editor' do # also same editor!
        it 'requests the user from Wikipedia API and returns 0' do
          old_rev = build(:old_revision, id: '328774110', contributor: nil)
          new_rev = build(:new_revision, id: '328774035', parent_id: '328774110')
          edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

          expect(subject.calculate(edit)).to eq 0
        end
      end

      context 'if old revision is not available anymore' do
        it 'returns missing' do
          # to get api call, see:
          # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=timestamp&revids=325218985
          # <rev revid="325218985"/>

          old_rev = build(:old_revision, id: '325218985', contributor: nil)
          new_rev = build(:new_revision, id: '326980599', parent_id: '325218985')
          edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

          expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the Anonymity feature: 1 when the new revision was written by a
# registered editor, 0 when it was written by an anonymous one.
describe Wikipedia::VandalismDetection::Features::Anonymity do
  it { is_expected.to be_a Features::Base }

  describe '#calculate' do
    it 'returns 1.0 in case of a registered editor' do
      edit = build :edit, new_revision: build(:registered_revision)
      expect(subject.calculate(edit)).to eq 1
    end

    it 'returns 0.0 in case of an anonymous editor' do
      edit = build :edit, new_revision: build(:anonymous_revision)
      expect(subject.calculate(edit)).to eq 0
    end
  end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the ArticleSize feature: size of the new revision's text.
describe Wikipedia::VandalismDetection::Features::ArticleSize do
  # Builds an edit from the two given Text objects (old revision first).
  def edit_with(old_rev_text, new_rev_text)
    previous = build(:old_revision, text: old_rev_text)
    current = build(:new_revision, text: new_rev_text)
    build(:edit, old_revision: previous, new_revision: current)
  end

  it { is_expected.to be_a Features::Base }

  describe '#calculate' do
    it 'returns the size of the edit’s new revisions' do
      # new text has size 10 (with spaces)
      edit = edit_with(Text.new('123'), Text.new('123 456789'))

      expect(subject.calculate(edit)).to eq(10)
    end

    it "returns 0 if the edit's new revisions is empty" do
      # new text has size 0
      edit = edit_with(Text.new('123'), Text.new)

      expect(subject.calculate(edit)).to eq(0)
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specs for the BadFrequency feature: share of bad words among the words
# inserted by an edit.
describe Wikipedia::VandalismDetection::Features::BadFrequency do
  it { is_expected.to be_a Features::FrequencyBase }

  describe '#calculate' do
    it 'returns the number of bad words relative to all words count' do
      # total 6 words, 3 bad
      old_text = Text.new('Old whatever.')
      new_text = Text.new('Old whatever. New ugly contents. Hi, gotta fun.')

      old_rev = build(:old_revision, text: old_text)
      new_rev = build(:new_revision, text: new_text)
      edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

      expect(subject.calculate(edit)).to eq 3.0 / 6.0
    end

    it 'returns 0.0 for an empty clean text in the new revision' do
      # Only a template is inserted.
      old_text = Text.new('Old guy.')
      new_text = Text.new('Old guy. {{speedy deletion}}')

      old_rev = build(:old_revision, text: old_text)
      new_rev = build(:new_revision, text: new_text)
      edit = build(:edit, old_revision: old_rev, new_revision: new_rev)

      expect(subject.calculate(edit)).to eq 0.0
    end
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::BadImpact do
|
|
4
|
+
it { is_expected.to be_a Features::ImpactBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the impact of bad words of the edit’s new revision text' do
|
|
8
|
+
# 3 bad words
|
|
9
|
+
old_text = Text.new('Hi, old text 666, dont know')
|
|
10
|
+
|
|
11
|
+
# 4 bad words
|
|
12
|
+
new_text = Text.new('Hi, new text dosent, whatever, guy')
|
|
13
|
+
|
|
14
|
+
old_rev = build(:old_revision, text: old_text)
|
|
15
|
+
new_rev = build(:new_revision, text: new_text)
|
|
16
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
17
|
+
|
|
18
|
+
expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it 'returns 0.5 on both no terms in text revisions' do
|
|
22
|
+
# Balanced template markup, matching the sibling examples that treat
# '{{speedy deletion}}' as text with an empty cleaned form.
text = Text.new('{{speedy deletion}}')
|
|
23
|
+
|
|
24
|
+
old_rev = build(:old_revision, text: text)
|
|
25
|
+
new_rev = build(:new_revision, text: text)
|
|
26
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
27
|
+
|
|
28
|
+
expect(subject.calculate(edit)).to eq 0.5
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it 'returns 0.0 on emtpy clean text of old revision' do
|
|
32
|
+
old_text = Text.new('{{speedy deletion}}')
|
|
33
|
+
new_text = Text.new('Guy')
|
|
34
|
+
|
|
35
|
+
old_rev = build(:old_revision, text: old_text)
|
|
36
|
+
new_rev = build(:new_revision, text: new_text)
|
|
37
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
38
|
+
|
|
39
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
it 'returns 1.0 on emtpy clean text of new revision' do
|
|
43
|
+
old_text = Text.new('Guy')
|
|
44
|
+
new_text = Text.new('{{speedy deletion}}')
|
|
45
|
+
|
|
46
|
+
old_rev = build(:old_revision, text: old_text)
|
|
47
|
+
new_rev = build(:new_revision, text: new_text)
|
|
48
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
49
|
+
|
|
50
|
+
expect(subject.calculate(edit)).to eq 1.0
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::Base do
|
|
4
|
+
describe '#count' do
|
|
5
|
+
let(:text) { 'I, you: i will help You' }
|
|
6
|
+
|
|
7
|
+
it { is_expected.to respond_to(:count).with(2).arguments }
|
|
8
|
+
|
|
9
|
+
it 'raises an error if option :in is not defined' do
|
|
10
|
+
expect { subject.count(%i[i you], from: text) }
|
|
11
|
+
.to raise_error ArgumentError
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
it 'raises an error if terms is not an Array or String' do
|
|
15
|
+
expect { subject.count({ term: 'You' }, in: text) }
|
|
16
|
+
.to raise_error ArgumentError
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns the total number of terms found for the given terms array' do
|
|
20
|
+
terms = %i[i you]
|
|
21
|
+
expect(subject.count(terms, in: text)).to eq 4
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
it 'returns the number of terms found for the given single term' do
|
|
25
|
+
expect(subject.count('You', in: text)).to eq 2
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
describe '#calculate' do
|
|
30
|
+
it { is_expected.to respond_to :calculate }
|
|
31
|
+
|
|
32
|
+
it 'takes an Wikipedia::Edit as argument' do
|
|
33
|
+
edit = build(:edit)
|
|
34
|
+
# A negated raise_error with a specific class would pass on any other
# exception, so assert that nothing is raised at all.
expect { subject.calculate(edit) }.not_to raise_error
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'raises an ArgumentError if argument is no Wikipedia::Edit' do
|
|
38
|
+
expect { subject.calculate('string') }.to raise_error ArgumentError
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::BiasedFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the inserted number of biased words over all inserted words' do
|
|
8
|
+
# inserted: total 7 words, 3 biased
|
|
9
|
+
old_text = Text.new('Great old.')
|
|
10
|
+
new_text = Text.new('Great old. This is so great, really a classic.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 3.0 / 7.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy clean inserted text' do
|
|
20
|
+
old_text = Text.new('Great old.')
|
|
21
|
+
new_text = Text.new('Great old. {{speedy deletion}}')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::BiasedImpact do
|
|
4
|
+
it { is_expected.to be_a Features::ImpactBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the impact of biased words of the new revision text' do
|
|
8
|
+
# 1 biased word
|
|
9
|
+
old_text = Text.new('this is classic!')
|
|
10
|
+
# 3 biased words
|
|
11
|
+
new_text = Text.new('This is classic, legendary and amazing!')
|
|
12
|
+
|
|
13
|
+
old_rev = build(:old_revision, text: old_text)
|
|
14
|
+
new_rev = build(:new_revision, text: new_text)
|
|
15
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
16
|
+
|
|
17
|
+
expect(subject.calculate(edit)).to eq 1.0 / (1.0 + 3.0)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
it 'returns 0.5 on both no terms in text revisions' do
|
|
21
|
+
text = Text.new('{{speedy deletion}}')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: text)
|
|
24
|
+
new_rev = build(:new_revision, text: text)
|
|
25
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.5
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
it 'returns 0.0 on emtpy clean text of old revision' do
|
|
31
|
+
old_text = Text.new('{{speedy deletion}}')
|
|
32
|
+
new_text = Text.new('great')
|
|
33
|
+
|
|
34
|
+
old_rev = build(:old_revision, text: old_text)
|
|
35
|
+
new_rev = build(:new_revision, text: new_text)
|
|
36
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
37
|
+
|
|
38
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it 'returns 1.0 on emtpy clean text of new revision' do
|
|
42
|
+
old_text = Text.new('great')
|
|
43
|
+
new_text = Text.new('{{speedy deletion}}')
|
|
44
|
+
|
|
45
|
+
old_rev = build(:old_revision, text: old_text)
|
|
46
|
+
new_rev = build(:new_revision, text: new_text)
|
|
47
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
48
|
+
|
|
49
|
+
expect(subject.calculate(edit)).to eq 1.0
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::Blanking do
|
|
4
|
+
let(:blank_text) { 'a' * (Features::Blanking::BLANKING_THRESHOLD - 1) }
|
|
5
|
+
|
|
6
|
+
it { is_expected.to be_a Features::Base }
|
|
7
|
+
|
|
8
|
+
describe '#calculate' do
|
|
9
|
+
it 'returns 1.0 in case of full blanking the new revision' do
|
|
10
|
+
# full blanking means size < BLANKING_THRESHOLD.
|
|
11
|
+
old_rev = build(:old_revision, text: "#{blank_text} additional text")
|
|
12
|
+
new_rev = build(:new_revision, text: blank_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 1.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 in case of not full blanking the new revision' do
|
|
19
|
+
# not full blanking means size > BLANKING_THRESHOLD.
|
|
20
|
+
old_rev = build(:old_revision, text: "#{blank_text} additional text")
|
|
21
|
+
new_rev = build(:new_revision, text: "#{blank_text}a")
|
|
22
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
23
|
+
|
|
24
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
it 'returns 0.0 if old revision is <= new revision' do
|
|
28
|
+
old_rev = build(:old_revision, text: blank_text)
|
|
29
|
+
# Use the non-mutating String#next: next! would change the memoized
# blank_text in place, i.e. the very object already passed to old_rev.
new_rev = build(:new_revision, text: blank_text.next)
|
|
30
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
31
|
+
|
|
32
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CharacterDiversity do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the character diversity of the new inserted text' do
|
|
8
|
+
old_text = Text.new('text')
|
|
9
|
+
# 9 unique characters of total 14
|
|
10
|
+
new_text = Text.new('text [[aa ab cdeefg]]')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 14**(1.0 / 9)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 if no text inserted' do
|
|
20
|
+
old_text = Text.new('deletion text')
|
|
21
|
+
new_text = Text.new('text')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CharacterSequence do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of the new revision’s longest character sequence' do
|
|
8
|
+
old_text = Text.new('a 666666')
|
|
9
|
+
new_text = Text.new("a 666666 4444ccc eefffff gggg g ''fffaffff''")
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 5
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0 if no text was inserted' do
|
|
20
|
+
old_text = Text.new('a 666666 4444ccc eeeefffff gggg g')
|
|
21
|
+
new_text = Text.new('a 666666 ')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
|
|
26
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
27
|
+
|
|
28
|
+
expect(subject.calculate(edit)).to eq 0
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentBadFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of bad words in comment over all words' do
|
|
8
|
+
# total 11 words, 6 bad words
|
|
9
|
+
comment = Text.new('666 was 666 if 666 was 666 and guy are 666')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision)
|
|
12
|
+
new_rev = build(:new_revision, comment: comment)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 6.0 / 11.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 on emtpy clean text comment' do
|
|
19
|
+
comment = Text.new('{{speedy deletion}}')
|
|
20
|
+
|
|
21
|
+
old_rev = build(:old_revision)
|
|
22
|
+
new_rev = build(:new_revision, comment: comment)
|
|
23
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|