wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentBiasedFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of biased words in comment over all words' do
|
|
8
|
+
# total 9 words, 4 biased
|
|
9
|
+
comment = Text.new('It’s Great man, this is amazing, really a classic.')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision)
|
|
12
|
+
new_rev = build(:new_revision, comment: comment)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 4.0 / 9.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 for an emtpy clean text comment in the new revision' do
|
|
19
|
+
comment = Text.new('{{speedy deletion}}')
|
|
20
|
+
|
|
21
|
+
old_rev = build(:old_revision)
|
|
22
|
+
new_rev = build(:new_revision, comment: comment)
|
|
23
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentLength do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the length of the new revisions comment' do
|
|
8
|
+
comment = Text.new('1 34567 9')
|
|
9
|
+
edit = build :edit, new_revision: build(:new_revision, comment: comment)
|
|
10
|
+
|
|
11
|
+
expect(subject.calculate(edit)).to eq 9
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
it 'returns 0 on emtpy clean text' do
|
|
15
|
+
text = Text.new('{{speedy deletion}}')
|
|
16
|
+
edit = build :edit, new_revision: build(:new_revision, text: text)
|
|
17
|
+
|
|
18
|
+
expect(subject.calculate(edit)).to eq 0
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentMarkupFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of markup words in comment over all words' do
|
|
8
|
+
# total 7 words, 3 markup
|
|
9
|
+
comment = Text.new('[[Content]] is not always {{simple}} to [[produce]]')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision)
|
|
12
|
+
new_rev = build(:new_revision, comment: comment)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 3.0 / 7.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 on emtpy text comment' do
|
|
19
|
+
comment = Text.new
|
|
20
|
+
|
|
21
|
+
old_rev = build(:old_revision)
|
|
22
|
+
new_rev = build(:new_revision, comment: comment)
|
|
23
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentPronounFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of pronouns in comment over all words count' do
|
|
8
|
+
# total 12 words, 7 pronouns
|
|
9
|
+
comment = Text.new('I was you if You was Me and we are ourselves us')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision)
|
|
12
|
+
new_rev = build(:new_revision, comment: comment)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 7.0 / 12.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 on emtpy clean text comment' do
|
|
19
|
+
comment = Text.new('{{speedy deletion}}')
|
|
20
|
+
|
|
21
|
+
old_rev = build(:old_revision)
|
|
22
|
+
new_rev = build(:new_revision, comment: comment)
|
|
23
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentSexFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of sex words in comment over all words' do
|
|
8
|
+
# total 9 words, 5 sex words
|
|
9
|
+
comment = Text.new('Penis was penis if penis was penis and anal')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision)
|
|
12
|
+
new_rev = build(:new_revision, comment: comment)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 5.0 / 9.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 on emtpy clean text comment' do
|
|
19
|
+
comment = Text.new('{{speedy deletion}}')
|
|
20
|
+
|
|
21
|
+
old_rev = build(:old_revision)
|
|
22
|
+
new_rev = build(:new_revision, comment: comment)
|
|
23
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::CommentVulgarismFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::FrequencyBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of vulgarism words in comment over all words' do
|
|
8
|
+
# total 7 words, 2 vulgarism
|
|
9
|
+
comment = Text.new('Fuck you bitch. This is my change!')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision)
|
|
12
|
+
new_rev = build(:new_revision, comment: comment)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 2.0 / 7.0
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0.0 on emtpy clean text comment' do
|
|
19
|
+
comment = Text.new('{{speedy deletion}}')
|
|
20
|
+
|
|
21
|
+
old_rev = build(:old_revision)
|
|
22
|
+
new_rev = build(:new_revision, comment: comment)
|
|
23
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
24
|
+
|
|
25
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
require 'zlib'
|
|
3
|
+
|
|
4
|
+
describe Wikipedia::VandalismDetection::Features::Compressibility do
|
|
5
|
+
it { is_expected.to be_a Features::Base }
|
|
6
|
+
|
|
7
|
+
describe '#calculate' do
|
|
8
|
+
it 'returns the ratio of compressed text size to uncompressed text size' do
|
|
9
|
+
old_text = 'text'
|
|
10
|
+
new_text = 'text [[If this is a quite long textpart]] of normal words ' \
|
|
11
|
+
'then it might be less possible to be a vandalism.'
|
|
12
|
+
|
|
13
|
+
old_rev = build(:old_revision, text: Text.new(old_text))
|
|
14
|
+
new_rev = build(:new_revision, text: Text.new(new_text))
|
|
15
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
16
|
+
|
|
17
|
+
bytesize = 10.0
|
|
18
|
+
|
|
19
|
+
allow(Zlib::Deflate).to receive(:deflate).and_return(Text.new)
|
|
20
|
+
allow_any_instance_of(Text).to receive(:bytesize).and_return(bytesize)
|
|
21
|
+
|
|
22
|
+
ratio = bytesize / (bytesize + bytesize)
|
|
23
|
+
|
|
24
|
+
expect(subject.calculate(edit)).to eq ratio
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
it 'returns 0.5 on emtpy inserted text' do
|
|
28
|
+
old_text = Text.new('deletion text')
|
|
29
|
+
new_text = Text.new(' text')
|
|
30
|
+
|
|
31
|
+
old_rev = build(:old_revision, text: old_text)
|
|
32
|
+
new_rev = build(:new_revision, text: new_text)
|
|
33
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
34
|
+
|
|
35
|
+
expect(subject.calculate(edit)).to eq 0.5
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::ContainsBase do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#contains' do
|
|
7
|
+
it 'returns 1 if a given text contains the given terms array' do
|
|
8
|
+
text = 'Content including text'
|
|
9
|
+
expect(subject.contains(text, %w[content anything])).to eq 1
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
it 'returns 1 if a given text contains the given string' do
|
|
13
|
+
text = 'Content including text'
|
|
14
|
+
expect(subject.contains(text, 'content')).to eq 1
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it 'returns 0 if a given text does not contain the given string' do
|
|
18
|
+
text = 'not containing anything con tent'
|
|
19
|
+
expect(subject.contains(text, 'content')).to eq 0
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'returns 0 if a given text does not contain any of the given terms' do
|
|
23
|
+
text = 'not containing anything con tent'
|
|
24
|
+
expect(subject.contains(text, %w[content text])).to eq 0
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::Copyedit do
|
|
4
|
+
it { is_expected.to be_a Features::ContainsBase }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns 1 if the edit comment includes "copyedit"' do
|
|
8
|
+
comment = Text.new('copyediting content')
|
|
9
|
+
new_rev = build(:new_revision, comment: comment)
|
|
10
|
+
edit = build(:edit, new_revision: new_rev)
|
|
11
|
+
|
|
12
|
+
expect(subject.calculate(edit)).to eq 1
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
it 'returns 1 if the edit comment includes "copy edit"' do
|
|
16
|
+
comment = Text.new('copy editing content')
|
|
17
|
+
new_rev = build(:new_revision, comment: comment)
|
|
18
|
+
edit = build(:edit, new_revision: new_rev)
|
|
19
|
+
|
|
20
|
+
expect(subject.calculate(edit)).to eq 1
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
it 'returns 0 for emtpy an comment in new revision' do
|
|
24
|
+
new_rev = build(:new_revision, comment: '')
|
|
25
|
+
edit = build :edit, new_revision: new_rev
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::DigitRatio do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the digit to all letters ratio for the new inserted text' do
|
|
8
|
+
old_text = Text.new('text1')
|
|
9
|
+
# 4 digits out of 8 alphanumeric characters in the inserted text
|
|
10
|
+
new_text = Text.new('text1 [[1A4 B6 8Cd]]')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq((1.0 + 4) / (1.0 + 8))
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 if no text was inserted' do
|
|
20
|
+
old_text = Text.new('deletion text')
|
|
21
|
+
new_text = Text.new('text')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::EditsPerUser do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
describe 'online' do
|
|
8
|
+
it 'returns the number of previous edits from same IP or ID' do
|
|
9
|
+
# https://en.wikipedia.org/w/api.php?action=query&format=json&list=usercontribs&ucuser=<name or ip>&ucprop=ids
|
|
10
|
+
old_rev = build(:old_revision, id: '527136737')
|
|
11
|
+
new_rev = build(
|
|
12
|
+
:new_revision,
|
|
13
|
+
id: '527137015',
|
|
14
|
+
parent_id: '527136737',
|
|
15
|
+
contributor: '142.11.81.219',
|
|
16
|
+
timestamp: '2012-12-09T05:30:07Z'
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
20
|
+
|
|
21
|
+
expect(subject.calculate(edit)).to eq 1
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
describe 'offline' do
|
|
26
|
+
before do
|
|
27
|
+
page = build(:page, id: '1234', title: 'Page Title')
|
|
28
|
+
|
|
29
|
+
# contributor: see factories/page.rb !
|
|
30
|
+
old_rev = build(:new_revision, contributor: 'User')
|
|
31
|
+
new_rev = build(:even_newer_revision, contributor: 'User')
|
|
32
|
+
|
|
33
|
+
@edit = build(
|
|
34
|
+
:edit,
|
|
35
|
+
old_revision: old_rev,
|
|
36
|
+
new_revision: new_rev,
|
|
37
|
+
page: page
|
|
38
|
+
)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it 'does not use an API call if the edit has a page reference' do
|
|
42
|
+
expect(Wikipedia).to_not receive :api_request
|
|
43
|
+
subject.calculate(@edit)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
it 'returns the number of previous edits from the same IP or ID' do
|
|
47
|
+
expect(subject.calculate(@edit)).to eq 1
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::EmoticonsFrequency do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of emoticons over all words' do
|
|
8
|
+
# total 8 words, 3 emoticons
|
|
9
|
+
old_text = Text.new('Old :-).')
|
|
10
|
+
new_text = Text.new('Old :-). ;) love icons and emoticons? :D :P, yeah.')
|
|
11
|
+
|
|
12
|
+
old_rev = build(:old_revision, text: old_text)
|
|
13
|
+
new_rev = build(:new_revision, text: new_text)
|
|
14
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
15
|
+
|
|
16
|
+
expect(subject.calculate(edit)).to eq 3.0 / 8.0
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'returns 0.0 on emtpy clean text revisions' do
|
|
20
|
+
old_text = Text.new('Old :-).')
|
|
21
|
+
new_text = Text.new('Old :-). {{speedy deletion}}')
|
|
22
|
+
|
|
23
|
+
old_rev = build(:old_revision, text: old_text)
|
|
24
|
+
new_rev = build(:new_revision, text: new_text)
|
|
25
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
26
|
+
|
|
27
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::EmoticonsImpact do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the impact of emoticon words of the new revision text' do
|
|
8
|
+
# 3 emoticons
|
|
9
|
+
old_text = ':) Hi you I got some :-X, you know ;)'
|
|
10
|
+
|
|
11
|
+
# 4 emoticons
|
|
12
|
+
new_text = ':) Hi (=you) I added another :-X you know ;)? (='
|
|
13
|
+
|
|
14
|
+
old_rev = build(:old_revision, text: old_text)
|
|
15
|
+
new_rev = build(:new_revision, text: new_text)
|
|
16
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
17
|
+
|
|
18
|
+
expect(subject.calculate(edit)).to eq 3.0 / (3.0 + 4.0)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it 'returns 0.5 if both text revisions have no terms' do
|
|
22
|
+
old_rev = build(:old_revision, text: '')
|
|
23
|
+
new_rev = build(:new_revision, text: '')
|
|
24
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0.5
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'returns 0.0 for an empty text in the old revision' do
|
|
30
|
+
old_rev = build(:old_revision, text: '')
|
|
31
|
+
new_rev = build(:new_revision, text: ':)')
|
|
32
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
33
|
+
|
|
34
|
+
expect(subject.calculate(edit)).to eq 0.0
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'returns 1.0 for an empty text in the new revision' do
|
|
38
|
+
old_rev = build(:old_revision, text: ':)')
|
|
39
|
+
new_rev = build(:new_revision, text: '')
|
|
40
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
41
|
+
|
|
42
|
+
expect(subject.calculate(edit)).to eq 1.0
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::FrequencyBase do
|
|
4
|
+
let(:terms) { Wikipedia::VandalismDetection::WordLists::PRONOUNS }
|
|
5
|
+
|
|
6
|
+
it { is_expected.to be_a Features::Base }
|
|
7
|
+
|
|
8
|
+
describe '#frequency' do
|
|
9
|
+
it { is_expected.to respond_to :frequency }
|
|
10
|
+
|
|
11
|
+
it 'returns the frequency in percentage of given word counts' do
|
|
12
|
+
text = 'I am, i like you.'
|
|
13
|
+
expect(subject.frequency(text, terms)).to eq 3.0 / 5.0
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
it 'returns 0.0 if total word count is zero' do
|
|
17
|
+
expect(subject.frequency('', terms)).to eq 0.0
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
3
|
+
|
|
4
|
+
describe Wikipedia::VandalismDetection::Features::ImpactBase do
|
|
5
|
+
let(:pronouns) { Wikipedia::VandalismDetection::WordLists::PRONOUNS }
|
|
6
|
+
|
|
7
|
+
it { is_expected.to be_a Features::Base }
|
|
8
|
+
|
|
9
|
+
describe '#impact' do
|
|
10
|
+
it { is_expected.to respond_to :impact }
|
|
11
|
+
|
|
12
|
+
it 'returns the impact in % of given terms in old relative to new text' do
|
|
13
|
+
# 3 pronouns
|
|
14
|
+
old_text = 'Your old text will be mine or Yours'
|
|
15
|
+
# 4 pronouns
|
|
16
|
+
new_text = 'My new text and your old text will be ours and mine'
|
|
17
|
+
|
|
18
|
+
expect(subject.impact(old_text, new_text, pronouns))
|
|
19
|
+
.to eq 3.0 / (3.0 + 4.0)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'returns 0.0 if old terms word count is zero' do
|
|
23
|
+
new_text = 'My new text and your old text will be ours and mine'
|
|
24
|
+
expect(subject.impact('', new_text, pronouns)).to eq 0.0
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
it 'returns 1.0 if new terms word count is zero' do
|
|
28
|
+
old_text = 'My new text and your old text will be ours and mine'
|
|
29
|
+
expect(subject.impact(old_text, '', pronouns)).to eq 1.0
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
it 'returns 0.5 if both terms word count is zero' do
|
|
33
|
+
expect(subject.impact('', '', pronouns)).to eq 0.5
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::InsertedCharacterDistribution do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the KL-Divergence of the inserted characters distribution' do
|
|
8
|
+
old_text = Text.new('old text')
|
|
9
|
+
new_text = Text.new('old text [[new inserted text]] given dero 9')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 1.6609633564650683
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns missing value if no alphanumeric characters were inserted' do
|
|
19
|
+
old_text = Text.new('old text')
|
|
20
|
+
new_text = Text.new('old text !* [[?]]')
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'returns missing value if no text was inserted' do
|
|
30
|
+
old_text = Text.new('deletion text')
|
|
31
|
+
new_text = Text.new('text')
|
|
32
|
+
|
|
33
|
+
old_rev = build(:old_revision, text: old_text)
|
|
34
|
+
new_rev = build(:new_revision, text: new_text)
|
|
35
|
+
edit = build(:edit, new_revision: new_rev, old_revision: old_rev)
|
|
36
|
+
|
|
37
|
+
expect(subject.calculate(edit)).to eq Features::MISSING_VALUE
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::InsertedExternalLinks do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of inserted external links' do
|
|
8
|
+
old_text = Text.new('123')
|
|
9
|
+
new_text = Text.new('123 [http://google.com Google] https://example.com')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 2
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 if no text was inserted' do
|
|
19
|
+
old_text = Text.new('123 456789')
|
|
20
|
+
new_text = Text.new('123') # 0 inserted
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::InsertedInternalLinks do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of inserted internal links' do
|
|
8
|
+
old_text = Text.new('123')
|
|
9
|
+
new_text = Text.new('123 [[link]] [[linkname|link]]')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 2
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 if no inserted text' do
|
|
19
|
+
old_text = Text.new('123 456789')
|
|
20
|
+
new_text = Text.new('123') # 0 inserted
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::InsertedSize do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the size of the text inserted in the new revision' do
|
|
8
|
+
old_text = Text.new('123')
|
|
9
|
+
new_text = Text.new('123 456789') # 6 inserted
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 6
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 if no inserted text' do
|
|
19
|
+
old_text = Text.new('123 456789')
|
|
20
|
+
new_text = Text.new('123') # 0 inserted
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::InsertedWords do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the number of the inserted words' do
|
|
8
|
+
old_text = Text.new('zero')
|
|
9
|
+
new_text = Text.new('zero one two three four five six')
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 6
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 if no inserted text' do
|
|
19
|
+
old_text = Text.new('zero one')
|
|
20
|
+
new_text = Text.new('zero')
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Wikipedia::VandalismDetection::Features::LongestWord do
|
|
4
|
+
it { is_expected.to be_a Features::Base }
|
|
5
|
+
|
|
6
|
+
describe '#calculate' do
|
|
7
|
+
it 'returns the length of the longest word in the inserted text' do
|
|
8
|
+
old_text = Text.new('1 7777777')
|
|
9
|
+
new_text = Text.new("1 7777777 22 a2c4e 333 55555\n======head======\nfff")
|
|
10
|
+
|
|
11
|
+
old_rev = build(:old_revision, text: old_text)
|
|
12
|
+
new_rev = build(:new_revision, text: new_text)
|
|
13
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
14
|
+
|
|
15
|
+
expect(subject.calculate(edit)).to eq 5
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'returns 0 on non inserted clean text' do
|
|
19
|
+
old_text = Text.new('1 22')
|
|
20
|
+
new_text = Text.new('1 22 {{speedy deletion}}')
|
|
21
|
+
|
|
22
|
+
old_rev = build(:old_revision, text: old_text)
|
|
23
|
+
new_rev = build(:new_revision, text: new_text)
|
|
24
|
+
edit = build(:edit, old_revision: old_rev, new_revision: new_rev)
|
|
25
|
+
|
|
26
|
+
expect(subject.calculate(edit)).to eq 0
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|