wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::MarkupFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of markup related words over all inserted words' do
|
|
8
|
+
# total 4 words, 3 markup
|
|
9
|
+
old_text = Text.new('Old whatever.')
|
|
10
|
+
new_text = Text.new('Old whatever. {{template}} <ref>list</ref> [[heading]] boy.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 3.0 / 4.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on no inserted text' do
|
|
20
|
+
text = 'Old guy.'
|
|
21
|
+
old_text = Text.new(text)
|
|
22
|
+
new_text = Text.new(text)
|
|
23
|
+
|
|
24
|
+
old_rev = build(:old_revision, text: old_text)
|
|
25
|
+
new_rev = build(:new_revision, text: new_text)
|
|
26
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
27
|
+
|
|
28
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::MarkupImpact do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the impact of markup words of the edit’s new revision text' do
|
|
8
|
+
# 3 markup words
|
|
9
|
+
old_text = '{{template}} <ref>reference</ref> [[hello]] hello'
|
|
10
|
+
|
|
11
|
+
# 4 markup words
|
|
12
|
+
new_text = '{{template}} <ref>reference</ref> [[hello]] cite dude'
|
|
13
|
+
|
|
14
|
+
old_rev = build(:old_revision, text: old_text)
|
|
15
|
+
new_rev = build(:new_revision, text: new_text)
|
|
16
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
17
|
+
|
|
18
|
+
expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it 'returns 0.5 on both no terms in text revisions' do
|
|
22
|
+
text = ''
|
|
23
|
+
|
|
24
|
+
old_rev = build(:old_revision, text: text)
|
|
25
|
+
new_rev = build(:new_revision, text: text)
|
|
26
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
27
|
+
|
|
28
|
+
expect(subject.calculate(edit)).to eq 0.5
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it 'returns 0.0 on emtpy text of old revision' do
|
|
32
|
+
old_text = ''
|
|
33
|
+
new_text = '{{template}}'
|
|
34
|
+
|
|
35
|
+
old_rev = build(:old_revision, text: old_text)
|
|
36
|
+
new_rev = build(:new_revision, text: new_text)
|
|
37
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
38
|
+
|
|
39
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
it 'returns 1.0 on emtpy text of new revision' do
|
|
43
|
+
old_text = '{{template}}'
|
|
44
|
+
new_text = ''
|
|
45
|
+
|
|
46
|
+
old_rev = build(:old_revision, text: old_text)
|
|
47
|
+
new_rev = build(:new_revision, text: new_text)
|
|
48
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
49
|
+
|
|
50
|
+
expect(subject.calculate(edit)).to eq 1.0
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::NonAlphanumericRatio do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the non-alphanum to all letters ratio of the inserted text' do
|
|
8
|
+
old_text = Text.new('t$xt')
|
|
9
|
+
# 7 non-alphanumeric characters out of 15 inserted characters in total
|
|
10
|
+
new_text = Text.new('t$xt [[1A$% 4B6]] 8Cd?')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq((1.0 + 7) / (1.0 + 15))
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 if no text was inserted' do
|
|
20
|
+
old_text = Text.new('deletion text')
|
|
21
|
+
new_text = Text.new('text')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::PersonalLife do
|
|
4
|
+
it { is_expected.to be_a Features::ContainsBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns 1 if the edit comment includes "personal life"' do
|
|
8
|
+
comment = Text.new('/* Personal life */ edited')
|
|
9
|
+
new_rev = build(:new_revision, comment: comment)
|
|
10
|
+
edit = build(:edit, new_revision: new_rev)
|
|
11
|
+
|
|
12
|
+
expect(subject.calculate(edit)).to eq 1
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
it 'returns 0 on emtpy comment' do
|
|
16
|
+
new_rev = build(:new_revision, comment: '')
|
|
17
|
+
edit = build(:edit, new_revision: new_rev)
|
|
18
|
+
|
|
19
|
+
expect(subject.calculate(edit)).to eq 0
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::PronounFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of pronouns relative to all words count' do
|
|
8
|
+
# total 10 words, 6 pronouns
|
|
9
|
+
old_text = Text.new('Your old.')
|
|
10
|
+
new_text = Text.new('Your old. I was you if You was we are ourselves us.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 6.0 / 10.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 for an emtpy clean text in the new revision' do
|
|
20
|
+
old_text = Text.new('Your old.')
|
|
21
|
+
new_text = Text.new('Your old. {{speedy deletion}}')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::PronounImpact do
|
|
4
|
+
it { is_expected.to be_a Features::ImpactBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the impact of pronouns of the new revision text' do
|
|
8
|
+
# 3 pronouns
|
|
9
|
+
old_text = Text.new('Your old text will be mine or Your’s')
|
|
10
|
+
|
|
11
|
+
# 4 pronouns
|
|
12
|
+
new_text = Text.new('My new text and your old text will be ours and mine')
|
|
13
|
+
|
|
14
|
+
old_rev = build(:old_revision, text: old_text)
|
|
15
|
+
new_rev = build(:new_revision, text: new_text)
|
|
16
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
17
|
+
|
|
18
|
+
expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it 'returns 0.5 if both text revisions include no terms' do
|
|
22
|
+
text = Text.new('{{speedy deletion}}')
|
|
23
|
+
|
|
24
|
+
old_rev = build(:old_revision, text: text)
|
|
25
|
+
new_rev = build(:new_revision, text: text)
|
|
26
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
27
|
+
|
|
28
|
+
expect(subject.calculate(edit)).to eq 0.5
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it 'returns 0.0 for emtpy clean text of old revision' do
|
|
32
|
+
old_text = Text.new('{{speedy deletion}}')
|
|
33
|
+
new_text = Text.new('You')
|
|
34
|
+
|
|
35
|
+
old_rev = build(:old_revision, text: old_text)
|
|
36
|
+
new_rev = build(:new_revision, text: new_text)
|
|
37
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
38
|
+
|
|
39
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
it 'returns 1.0 for emtpy clean text of new revision' do
|
|
43
|
+
old_text = Text.new('You')
|
|
44
|
+
new_text = Text.new('{{speedy deletion}}')
|
|
45
|
+
|
|
46
|
+
old_rev = build(:old_revision, text: old_text)
|
|
47
|
+
new_rev = build(:new_revision, text: new_text)
|
|
48
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
49
|
+
|
|
50
|
+
expect(subject.calculate(edit)).to eq 1.0
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedAllWordlistsFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed lists words over all removed words' do
|
|
8
|
+
# removed: total 7 words, 1 vulgarism, 1 biased, 2 pronouns = 4 wordlist words
|
|
9
|
+
old_text = Text.new('Your old shit. Fuck you great, you and the others.')
|
|
10
|
+
new_text = Text.new('Your old shit.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 4.0 / 7.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 for an empty removed clean text' do
|
|
20
|
+
old_text = Text.new('Your old shit. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('Your old shit.')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedBadFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed bad words over all removed words' do
|
|
8
|
+
# removed: total 9 words, 4 bad
|
|
9
|
+
old_text = Text.new('666 old. It’s 666 man, this is 666, 666 a whatever.')
|
|
10
|
+
new_text = Text.new('666 old.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 4.0 / 9.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 for an emtpy removed clean text' do
|
|
20
|
+
old_text = Text.new('whatever old. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('whatever old. whatever new.')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedBiasedFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed biased words over removed words count' do
|
|
8
|
+
# removed: total 7 words, 3 biased (great, really, classic)
|
|
9
|
+
old_text = Text.new('Great old. This is so great, really a classic.')
|
|
10
|
+
new_text = Text.new('Great old.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 3.0 / 7.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy removed clean text' do
|
|
20
|
+
old_text = Text.new('Great old. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('Great old. Great new.')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedCharacterDistribution do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the KL-Divergence of the removed characters distribution' do
|
|
8
|
+
old_text = Text.new('old text [[new inserted text]] given dero 9')
|
|
9
|
+
new_text = Text.new('old text')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 1.6609633564650683
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns a missing value if no alphanumeric characters were removed' do
|
|
19
|
+
old_text = Text.new('old text !* [[?]]')
|
|
20
|
+
new_text = Text.new('old text')
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'returns a missing value if no text was inserted' do
|
|
30
|
+
old_text = Text.new('text')
|
|
31
|
+
new_text = Text.new('deletion text')
|
|
32
|
+
|
|
33
|
+
old_rev = build(:old_revision, text: old_text)
|
|
34
|
+
new_rev = build(:new_revision, text: new_text)
|
|
35
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
36
|
+
|
|
37
|
+
expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedEmoticonsFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed emoticon words over all removed words' do
|
|
8
|
+
# removed: total 6 words, 2 emoticons
|
|
9
|
+
old_text = Text.new(':) old. It’s :P man:Pio, this is X-D.')
|
|
10
|
+
new_text = Text.new(':) old.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 2.0 / 6.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy removed text' do
|
|
20
|
+
old_text = Text.new('Great old. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('Great old. {{speedy deletion}} :)')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedMarkupFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed markup words over all removed words' do
|
|
8
|
+
# removed: total 5 words, 2 markup
|
|
9
|
+
old_text = Text.new('[[Great]] old. It is [[Great]] man, [[amazing]].')
|
|
10
|
+
new_text = Text.new('[[Great]] old.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 2.0 / 5.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy removed text' do
|
|
20
|
+
old_text = Text.new('Great old. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('Great old. {{speedy deletion}} [[heading]]')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedPronounFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed pronouns over all removed words' do
|
|
8
|
+
# total 10 words, 6 pronouns
|
|
9
|
+
old_text = Text.new('Your old. I was you if You was We are ourselves us.')
|
|
10
|
+
new_text = Text.new('Your old.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 6.0 / 10.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 for an emtpy removed clean text in the new revision' do
|
|
20
|
+
old_text = Text.new('Your old. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('Your old. My inserted.')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedSexFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed sex words over all removed words' do
|
|
8
|
+
# removed: total 7 words, 3 sex words
|
|
9
|
+
old_text = Text.new('Penis old. It’s Penis man, this is penis, anal.')
|
|
10
|
+
new_text = Text.new('Penis old.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 3.0 / 7.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy removed clean text' do
|
|
20
|
+
old_text = Text.new('penis old. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('penis old. Penis new.')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedSize do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the size of the new removed text' do
|
|
8
|
+
old_text = Text.new('123 456789')
|
|
9
|
+
new_text = Text.new('123') # 6 removed
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 6
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 if no removed text ' do
|
|
19
|
+
old_text = Text.new('123')
|
|
20
|
+
new_text = Text.new('123 456789') # 0 removed
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedVulgarismFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of removed vulgarism over all removed words' do
|
|
8
|
+
# total 8 words, 3 vulgarism
|
|
9
|
+
old_text = Text.new('Old shit. Fuck, fu*ck you $lut, and all the others.')
|
|
10
|
+
new_text = Text.new('Old shit. New shit.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 3.0 / 8.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy removed clean text revisions' do
|
|
20
|
+
old_text = Text.new('Old shit. {{speedy deletion}}')
|
|
21
|
+
new_text = Text.new('Old shit. New shit.')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::RemovedWords do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of the edit’s removed words' do
|
|
8
|
+
old_text = Text.new('zero one two three four five six') # 6 removed
|
|
9
|
+
new_text = Text.new('zero')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 6
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 if no text was removed' do
|
|
19
|
+
old_text = Text.new('zero') # 0 removed
|
|
20
|
+
new_text = Text.new('zero one')
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::ReplacementSimilarity do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the similarity of the deleted text to inserted in exchange' do
|
|
8
|
+
old_text = Text.new('this is Mr. Dixon')
|
|
9
|
+
new_text = Text.new('this is Mr. Dicksonx')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 0.8133333333333332
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0 if the old revision text is empty' do
|
|
20
|
+
old_rev = build(:old_revision, text: '')
|
|
21
|
+
new_rev = build(:new_revision, text: '{{speedy deletion}}')
|
|
22
|
+
|
|
23
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
it 'returns 0 if the new revision text is empty' do
|
|
29
|
+
old_rev = build(:old_revision, text: '{{speedy deletion}}')
|
|
30
|
+
new_rev = build(:new_revision, text: '')
|
|
31
|
+
|
|
32
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
33
|
+
|
|
34
|
+
expect(subject.calculate(edit)).to eq 0
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::Reverted do
|
|
4
|
+
it { is_expected.to be_a Features::ContainsBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
%w[rvt rvv revert].each do |term|
|
|
8
|
+
it "returns 1 if the edit comment includes '#{term}'" do
|
|
9
|
+
comment = Text.new("#{term} edited")
|
|
10
|
+
new_rev = build(:new_revision, comment: comment)
|
|
11
|
+
edit = build(:edit, new_revision: new_rev)
|
|
12
|
+
|
|
13
|
+
expect(subject.calculate(edit)).to eq 1
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it 'returns 0 for an emtpy comment' do
|
|
18
|
+
new_rev = build(:new_revision, comment: '')
|
|
19
|
+
edit = build(:edit, new_revision: new_rev)
|
|
20
|
+
|
|
21
|
+
expect(subject.calculate(edit)).to eq 0
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|